In [None]:
!pip install faiss-cpu transformers sentence-transformers
!pip install torch torchvision --no-cache-dir

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [3]:
import pandas as pd
import numpy as np
import faiss
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer

# Load the dataset
file_path = "/content/drive/My Drive/MCQ Question Generation/merged_mcq_dataset.csv"  # Update to your biology dataset path
dataset = pd.read_csv(file_path, encoding="latin1")
dataset.fillna("", inplace=True)

# Combine all context columns into one
dataset['Combined'] = (
    dataset['Question Text'].astype(str) + " " +
    dataset['Option 1'].astype(str) + " " +
    dataset['Option 2'].astype(str) + " " +
    dataset['Option 3'].astype(str) + " " +
    dataset['Option 4'].astype(str) + " " +
    dataset['Option 5'].astype(str)
)

# Load Sentence-BERT for semantic search
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = semantic_model.encode(dataset['Combined'].tolist(), show_progress_bar=True)

# Create a FAISS index
dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(np.array(embeddings))

# Load the fine-tuned model
model_name = "/content/drive/My Drive/MCQ Question Generation/saved_model"  # Update this path
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
generation_model = AutoModelForCausalLM.from_pretrained(model_name)

# Retrieve diverse biology contexts using FAISS
def retrieve_diverse_contexts_faiss(k=3):
    # Randomly select embeddings from the FAISS index
    random_indices = np.random.choice(len(dataset), k, replace=False)
    random_embeddings = np.array([embeddings[i] for i in random_indices])

    # Perform a FAISS search on these random embeddings
    _, indices = faiss_index.search(random_embeddings, k=1)  # Retrieve nearest neighbor for each
    return dataset.iloc[indices.flatten()]['Combined'].tolist()

# Generate questions for all difficulties
def generate_biology_questions():
    difficulties = ["easy", "medium", "hard"]
    questions = {}

    for difficulty in difficulties:
        # Retrieve diverse contexts using FAISS
        retrieved_contexts = retrieve_diverse_contexts_faiss(k=1)
        context = " ".join(retrieved_contexts)

        # Create a biology-specific prompt
        prompt = f"""
        Based on the following biology context:
        {context}

        Your task is to generate a **biology multiple-choice question** for the **{difficulty} level**:

        - **For Easy Level**: The question should test basic biology concepts, simple definitions, or fundamental facts that are straightforward and easy to recall.
        - **For Medium Level**: The question should involve intermediate biology concepts, processes, or applications that require some reasoning or understanding of relationships between concepts.
        - **For Hard Level**: The question should test advanced biology concepts, detailed mechanisms, or require critical thinking and analysis of biological principles.

        - Provide exactly **five distinct answer options** labeled a, b, c, d, and e.
        - Only one answer option should be correct.
        - Clearly indicate the correct answer.

        Please output in the following format only:
        - Question: <Your question>
        - a) <Option 1>
        - b) <Option 2>
        - c) <Option 3>
        - d) <Option 4>
        - e) <Option 5>
        - Correct Answer: <Correct option letter>

        Do not include any additional text, explanations, or examples.
        """

        # Tokenize and generate
        input_ids = tokenizer.encode(prompt, return_tensors="pt", padding=True, truncation=True)
        output = generation_model.generate(input_ids, max_length=500, temperature=0.7, top_k=50, top_p=0.85)

        # Decode the response
        question = tokenizer.decode(output[0], skip_special_tokens=True)
        questions[difficulty] = question

    return questions

# Example usage
questions = generate_biology_questions()

# Print the questions
for difficulty, question in questions.items():
    print(f"Difficulty: {difficulty.capitalize()}\n{question}\n")


Batches:   0%|          | 0/73 [00:00<?, ?it/s]

Unrecognized keys in `rope_scaling` for 'rope_type'='llama3': {'name'}
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Difficulty: Easy

        Based on the following biology context:
        Followings are some statements regarding photosynthetic pigments. P - Chlorophylls absorb red and blue lights. Q- Chlorophyll a directly participate in the light reactions of photosynthesis. R- All the carotenoids involve in the absorption and dissipation of excessive light. S- Chlorophyll a is more effective for blue and red light. Correct response / responses of the above. Only Q   P and Q  R and S  Only S  Q and S 

        Your task is to generate a **biology multiple-choice question** for the **easy level**:

        - **For Easy Level**: The question should test basic biology concepts, simple definitions, or fundamental facts that are straightforward and easy to recall.
        - **For Medium Level**: The question should involve intermediate biology concepts, processes, or applications that require some reasoning or understanding of relationships between concepts.
        - **For Hard Level**: The question 