In [1]:
!pip install faiss-cpu transformers sentence-transformers
!pip install torch torchvision --no-cache-dir



In [65]:
import pandas as pd
import numpy as np
import faiss
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer

# Load the dataset
file_path = "/content/merged_mcq_dataset.csv"  # Update to your biology dataset path
dataset = pd.read_csv(file_path, encoding="latin1")
dataset.fillna("", inplace=True)

# Combine all context columns into one
dataset['Combined'] = (
    dataset['Question Text'].astype(str) + " " +
    dataset['Option 1'].astype(str) + " " +
    dataset['Option 2'].astype(str) + " " +
    dataset['Option 3'].astype(str) + " " +
    dataset['Option 4'].astype(str) + " " +
    dataset['Option 5'].astype(str)
)

# Load Sentence-BERT for semantic search
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = semantic_model.encode(dataset['Combined'].tolist(), show_progress_bar=True)

# Create a FAISS index
dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(np.array(embeddings))

# Load your fine-tuned model
model_name = "/content/drive/My Drive/MCQ Question Generation/saved_model"  # Update this path
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
generation_model = AutoModelForCausalLM.from_pretrained(model_name)

# Define the function to retrieve biology context from the dataset
def retrieve_context_biology(k=1):
    # Randomly pick biology-related contexts
    random_indices = np.random.choice(len(dataset), k, replace=False)
    return dataset.iloc[random_indices]['Combined'].tolist()

# Define the function to generate questions for all difficulties
def generate_biology_questions():
    difficulties = ["easy", "medium", "hard"]
    questions = {}

    for difficulty in difficulties:
        # Retrieve a random biology context
        retrieved_context = retrieve_context_biology(k=1)
        context = " ".join(retrieved_context)

        # Create a biology-specific prompt
        # Create a refined biology-specific prompt with difficulty-specific instructions
        prompt = f"""
        Based on the following biology context:
        {context}

        Your task is to generate a **biology multiple-choice question** for the **{difficulty} level**:

        - **For Easy Level**: The question should test basic biology concepts, simple definitions, or fundamental facts that are straightforward and easy to recall.
        - **For Medium Level**: The question should involve intermediate biology concepts, processes, or applications that require some reasoning or understanding of relationships between concepts.
        - **For Hard Level**: The question should test advanced biology concepts, detailed mechanisms, or require critical thinking and analysis of biological principles.

        - Provide exactly **five distinct answer options** labeled a, b, c, d, and e.
        - Only one answer option should be correct.
        - Clearly indicate the correct answer.

        Please output in the following format only:
        - Question: <Your question>
        - a) <Option 1>
        - b) <Option 2>
        - c) <Option 3>
        - d) <Option 4>
        - e) <Option 5>
        - Correct Answer: <Correct option letter>

        Do not include any additional text, explanations, or examples.
        """


        # Tokenize and generate
        input_ids = tokenizer.encode(prompt, return_tensors="pt", padding=True, truncation=True)
        output = generation_model.generate(input_ids, max_length=500, temperature=0.7, top_k=50, top_p=0.85)

        # Decode the response
        question = tokenizer.decode(output[0], skip_special_tokens=True)
        questions[difficulty] = question

    return questions

# Example usage
questions = generate_biology_questions()

# Print the questions
for difficulty, question in questions.items():
    print(f"Difficulty: {difficulty.capitalize()}\n{question}\n")

Batches:   0%|          | 0/73 [00:00<?, ?it/s]

Unrecognized keys in `rope_scaling` for 'rope_type'='llama3': {'name'}
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Difficulty: Easy

        Based on the following biology context:
        Some of the characteristics of vascular plants are given below, A. Production of seeds B. Presence of tracheid and vessel elements in the xylem. C. Seeds being located in carpels. D. Being homosporous. E. Production of pollen tube. Select the characters found only in angiosperms. A only A and E only B and E only B and C only C only

        Your task is to generate a **biology multiple-choice question** for the **easy level**:

        - **For Easy Level**: The question should test basic biology concepts, simple definitions, or fundamental facts that are straightforward and easy to recall.
        - **For Medium Level**: The question should involve intermediate biology concepts, processes, or applications that require some reasoning or understanding of relationships between concepts.
        - **For Hard Level**: The question should test advanced biology concepts, detailed mechanisms, or require critical thinking

In [66]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
