In [51]:
import numpy as np
import tensorflow as tf
from pygame.examples.video import answer
from reportlab.lib.randomtext import subjects
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout
import spacy
import random



In [52]:
nlp = spacy.load("en_core_web_sm")

In [53]:
doc = nlp("Apple is looking at buying a startup in the United States.")
doc

Apple is looking at buying a startup in the United States.

In [54]:
def preprocess_text(text):
    doc=nlp(text)
    sentence=[sent.text for sent in doc.sents]
    return sentence

def create_training_data(sentence,tokenizer,max_length):
    sentence=tokenizer.texts_to_sequences(sentence)
    padded_sequences=pad_sequences(sentence,maxlen=max_length,padding='post')
    return padded_sequences



def build_lstm_model(vocab_size, max_length, embedding_dim):
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        LSTM(128, return_sequences=True),
        Dropout(0.2),
        LSTM(64),
        Dense(64, activation='relu'),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model




In [55]:
def find_similar_words(word, num_similar=3):
    word_token = nlp.vocab[word] if word in nlp.vocab else None
    if not word_token or not word_token.has_vector:
        return ["[Distractor]"] * num_similar  # Return placeholders if no vector is found

    # Compute similarity with other words in vocab
    similarities = []
    for token in nlp.vocab:
        if token.is_alpha and token.has_vector and token != word_token:
            similarity = word_token.similarity(token)
            similarities.append((token.text, similarity))

    # Sort and return top similar words
    similarities.sort(key=lambda x: x[1], reverse=True)
    return [word for word, _ in similarities[:num_similar]]

In [56]:
def generate_mcqs_lstm(text, tokenizer, max_length, model, num_questions=5):
    sentences = preprocess_text(text)
    selected_sentences = random.sample(sentences, min(num_questions, len(sentences)))

    mcqs = []
    for sentence in selected_sentences:
        doc = nlp(sentence)
        nouns = [token.text for token in doc if token.pos_ == "NOUN"]
        if len(nouns) < 1:
            continue

        subject = random.choice(nouns)
        question_stem = sentence.replace(subject, "______")

        # Generate similar words using spaCy
        similar_words = find_similar_words(subject, num_similar=3)

        answer_choices = [subject] + similar_words
        random.shuffle(answer_choices)
        correct_answer = chr(65 + answer_choices.index(subject))

        mcqs.append((question_stem, answer_choices, correct_answer))

    return mcqs

In [57]:
text = """Deep learning is a subset of machine learning that uses neural networks. LSTMs are useful for processing sequential data like text.
Natural language processing involves techniques like tokenization and named entity recognition."""


In [58]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocess_text(text))
vocab_size = len(tokenizer.word_index) + 1
max_length = 20

# Train LSTM model (Note: Training requires large datasets)
model = build_lstm_model(vocab_size, max_length, embedding_dim=100)

# Generate MCQs
mcqs = generate_mcqs_lstm(text, tokenizer, max_length, model, num_questions=3)
for i, (q, choices, ans) in enumerate(mcqs, 1):
    print(f"Q{i}: {q}")
    print(f" A) {choices[0]}  B) {choices[1]}  C) {choices[2]}  D) {choices[3]}")
    print(f"Correct Answer: {ans}\n")

Q1: Natural language ______ involves techniques like tokenization and named entity recognition.
 A) processing  B) [Distractor]  C) [Distractor]  D) [Distractor]
Correct Answer: A

Q2: Deep ______ is a subset of machine ______ that uses neural networks.
 A) [Distractor]  B) [Distractor]  C) [Distractor]  D) learning
Correct Answer: D

Q3: LSTMs are useful for processing sequential ______ like text.

 A) data  B) [Distractor]  C) [Distractor]  D) [Distractor]
Correct Answer: A



In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id = "voidful/bart-eqg-question-generator"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

context = """Mitosis is a part of the cell cycle in which replicated chromosomes are separated into two new nuclei."""
prompt = f"Generate 3 multiple choice questions from this paragraph:\n{context}"

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:  84%|########3 | 598M/712M [00:00<?, ?B/s]

Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at voidful/bart-eqg-question-generator and are newly initialized: ['model.decoder.embed_tokens.weight', 'model.encoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


who who is is is will is wentc ( is .a ( .a


In [6]:
model.save_pretrained("my_mcq_model")
tokenizer.save_pretrained("my_mcq_model")




('my_mcq_model\\tokenizer_config.json',
 'my_mcq_model\\special_tokens_map.json',
 'my_mcq_model\\vocab.json',
 'my_mcq_model\\merges.txt',
 'my_mcq_model\\added_tokens.json',
 'my_mcq_model\\tokenizer.json')

In [9]:
context = """
Photosynthesis is the process by which green plants, algae, and certain bacteria convert light energy into chemical energy stored in glucose.
This process mainly takes place in the chloroplasts of plant cells, which contain the green pigment chlorophyll that captures light energy.
Photosynthesis occurs in two main stages: the light-dependent reactions and the light-independent reactions (Calvin cycle).
During the light-dependent reactions, sunlight is absorbed by chlorophyll, and the energy is used to split water molecules into oxygen, protons, and electrons.
Oxygen is released as a byproduct, while ATP and NADPH are produced as energy carriers.
In the Calvin cycle, which takes place in the stroma of the chloroplast, ATP and NADPH are used to fix carbon dioxide into glucose.
This glucose provides energy for the plant’s growth, reproduction, and other metabolic activities.
Photosynthesis is crucial for life on Earth as it not only provides oxygen but also serves as the foundation of the food chain.
"""

prompt = f"Generate questions from this paragraph:\n{context}"
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    temperature=0.8,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3
)

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


who whoier is is is wasku to , to ( to to ( ( ( isa ( ) ( ( .a ( ( ) . ( ( a ( ( overaya ( . a
