In [None]:
!pip install faiss-cpu transformers




In [None]:
!pip install torch torchvision --no-cache-dir

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import faiss
from transformers import AutoModelForCausalLM, AutoTokenizer

# Step 1: Load the dataset
file_path = 'merged_mcq_dataset.csv'  # Update this path to your file
dataset = pd.read_csv(file_path, encoding='latin1')  # Use appropriate encoding

# Step 2: Fill missing values with an empty string
dataset.fillna("", inplace=True)

# Step 3: Combine question and options into a single field for context
dataset['Combined'] = (
    dataset['Question Text'].astype(str) + " " +
    dataset['Option 1'].astype(str) + " " +
    dataset['Option 2'].astype(str) + " " +
    dataset['Option 3'].astype(str) + " " +
    dataset['Option 4'].astype(str) + " " +
    dataset['Option 5'].astype(str)
)

# Step 4: Vectorize the combined text using TF-IDF
vectorizer = TfidfVectorizer(max_features=500)
vectors = vectorizer.fit_transform(dataset['Combined']).toarray()

# Step 5: Create FAISS index
dimension = vectors.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(vectors)

print("FAISS index created successfully!")

# Step 6: Function to retrieve relevant context
def retrieve_context(query, k=3):
    query_vector = vectorizer.transform([query]).toarray()
    _, indices = faiss_index.search(query_vector, k)
    return dataset.iloc[indices[0]]

# Step 7: Load the language model and tokenizer
model_name = "./saved_model"  # Update this to your saved model path
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Step 8: Generate a question using retrieved context
def generate_question(query, difficulty="easy"):
    # Retrieve relevant context
    retrieved_context = retrieve_context(query)
    context = " ".join(retrieved_context['Combined'].tolist())

    # Create prompt
    prompt = f"""
                Using the following context:
                {context}

                Your task is to generate **single multiple-choice biology question** for the {difficulty} level. Ensure the question is clear, concise, and directly related to the provided context. 

                The format should include:
                - One question
                - Five answer options labelled as a, b, c, d and e
                - Indicate which option is the correct answer.

                Do not include example questions or extra text.

                Output format:
                Question: <Your question here>
                a) <Option 1>
                b) <Option 2>
                c) <Option 3>
                d) <Option 4>
                e) <Option 5>
                Correct Answer: <Correct option letter>
                """

    # Tokenize prompt and generate response
    input_ids = tokenizer.encode(prompt, return_tensors="pt", padding=True, truncation=True)
    output = model.generate(input_ids, max_length=1000, temperature=0.7, top_k=50)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Example usage
difficulty = "easy"
query = f"Generate a biology question for {difficulty} level."
generated_mcq = generate_question(query, difficulty=difficulty)
print(generated_mcq)


FAISS index created successfully!


Unrecognized keys in `rope_scaling` for 'rope_type'='llama3': {'name'}
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



                Using the following context:
                In human female, after parturition There will be no change in the blood progesterone level of mother Secretion of PIH increases gradually There can be periodic increase of Oestradiol level Degeneration of corpus luteum High level of Oestradiol inhibits secretion of GnRH from pituitary gland Correct statement regarding feedback mechanism of endocrine gland Most of the hormones are regulated by positive feed back Some hormone levels directly regulated by blood level stimuli Signals from hypothalamus to inhibit heat gain are inhibited by positive feed back When temperature rises above normal level, secretion of thyroid hormone is stimulated When blood osmolarity rises beyond normal level, ADH is not secreted  In the order and organizational level of living matter, in which level plant leaf can be placed? Molecule Organ Cell Tissue Organ system

                Your task is to generate **single multiple-choice biology question**