In [1]:
!pip install openpyxl

Collecting openpyxl


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [13]:
import pandas as pd
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Load summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_content(content):
    """Summarizes long chapter content to fit within the model's limit."""
    try:
        # Ensure content is a string
        content = str(content)
        if not content.strip():
            return ""
            
        token_limit = 1024
        tokenized_length = len(content.split())
        
        if tokenized_length > token_limit:
            summary = summarizer(content, max_length=300, min_length=100, do_sample=False)[0]['summary_text']
            return summary
        return content
    except Exception as e:
        print(f"Error in summarization: {e}")
        return content

class QuizGenerator:
    def __init__(self, model_name="gpt2"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        
        # Properly set up the tokenizer
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model.config.pad_token_id = self.model.config.eos_token_id
        
    def generate_question_from_content(self, content):
        """Generates a multiple-choice question based on chapter content."""
        try:
            if not content or not str(content).strip():
                return "No valid content to generate a question."
            
            # Summarize content
            summarized_content = summarize_content(content)
            if not summarized_content:
                return "Failed to process content."
            
            # Create prompt
            prompt = (
                "Generate a multiple-choice question with 4 options based on this content. "
                "Include the correct answer marked with [CORRECT]. The question should be "
                f"educational and clear:\n\n{summarized_content}\n\nQuestion:"
            )
            
            # Encode with proper handling
            encoded = self.tokenizer.encode_plus(
                prompt,
                add_special_tokens=True,
                return_tensors="pt",
                padding='max_length',
                max_length=512,  # Reduced max_length to avoid position embedding issues
                truncation=True,
                return_attention_mask=True
            )
            
            # Generate
            with torch.no_grad():
                outputs = self.model.generate(
                    input_ids=encoded['input_ids'],
                    attention_mask=encoded['attention_mask'],
                    max_new_tokens=150,
                    pad_token_id=self.tokenizer.pad_token_id,
                    num_return_sequences=1,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                    no_repeat_ngram_size=2,
                    early_stopping=True
                )
            
            # Decode and clean up
            question = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            question = question.replace(prompt, "").strip()
            
            return question if question else "Failed to generate question."
            
        except Exception as e:
            print(f"Error in question generation: {e}")
            return "Error generating question."

def load_dataset(filepath: str) -> pd.DataFrame:
    """Loads dataset from an Excel file."""
    try:
        df = pd.read_excel(filepath)
        # Ensure the required column exists
        if "Chapter_content" not in df.columns:
            print("Warning: 'Chapter_content' column not found in dataset")
            return pd.DataFrame()
        return df
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return pd.DataFrame()

def create_quiz_from_data(data: pd.DataFrame, quiz_generator: QuizGenerator):
    """Creates quiz questions from the dataset."""
    questions = []
    
    try:
        # Process each row
        for idx, row in data.iterrows():
            print(f"Generating question {idx + 1}/{len(data)}...")
            
            # Get content safely
            content = str(row.get("Chapter_content", "")).strip()
            if not content:
                continue
                
            # Generate question
            question = quiz_generator.generate_question_from_content(content)
            if question and question != "Failed to generate question.":
                questions.append(question)
            
            # Limit number of questions if needed
            if len(questions) >= 5:  # Adjust this number as needed
                break
                
    except Exception as e:
        print(f"Error in quiz creation: {e}")
    
    return questions

def main():
    # Set device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    
    # Load dataset
    dataset_path = "dataset_syllabus.xlsx"
    data = load_dataset(dataset_path)
    
    if data.empty:
        print("Dataset is empty or could not be loaded.")
        return
    
    print(f"Loaded dataset with {len(data)} rows")
    
    # Initialize quiz generator
    quiz_generator = QuizGenerator()
    
    # Generate questions
    print("Generating questions...")
    questions = create_quiz_from_data(data, quiz_generator)
    
    # Print results
    print("\nGenerated Questions:")
    for idx, question in enumerate(questions, start=1):
        print(f"\nQuestion {idx}:")
        print(question)
        print("-" * 50)

if __name__ == "__main__":
    main()

Device set to use cpu


Using device: cpu
Loaded dataset with 277 rows
Generating questions...
Generating question 1/277...




Generating question 2/277...
Generating question 3/277...
Generating question 4/277...
Generating question 5/277...

Generated Questions:

Question 1:
Generate a multiple-choice question with 4 options based on this content. Include the correct answer marked with [CORRECT]. The question should be educational and clear:

Chapter 1: The Living World
What is Living?
Living organisms are highly organized structures that exhibit growth, reproduction, metabolism, and response to stimuli.
The key characteristics of living beings include:
Growth – Increase in size or number of cells.
Reproduction – Ability to produce offspring.
Metabolism – Sum of all chemical reactions in an organism.
Cellular Organization – All living things are made of cells.
Response to Stimuli (Consciousness) – Ability to react to changes in the environment.
Characteristics of Living Organisms
1. Growth
Definition: Growth refers to the increase in the mass and size of an organism.
In Living Organisms: Occurs by cell divis

In [None]:
working code:

In [1]:
import pandas as pd
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Load summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_content(content):
    """Summarizes long chapter content to fit within the model's limit."""
    try:
        content = str(content).strip()
        if not content:
            return ""
            
        token_limit = 512  # Lower limit for better question relevance
        if len(content.split()) > token_limit:
            summary = summarizer(content, max_length=250, min_length=100, do_sample=False)[0]['summary_text']
            return summary
        return content
    except Exception as e:
        print(f"Error in summarization: {e}")
        return content

class QuizGenerator:
    def __init__(self, model_name="mistralai/Mistral-7B-Instruct"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
        self.model.eval()

    def generate_question_from_content(self, content):
        """Generates a structured multiple-choice question based on chapter content."""
        try:
            if not content:
                return "No valid content to generate a question."
            
            summarized_content = summarize_content(content)
            if not summarized_content:
                return "Failed to process content."

            prompt = (
                "You are a quiz generator AI. Given the following educational content, create a multiple-choice question "
                "with 4 answer options, clearly marking the correct answer as '[CORRECT]'. Ensure the question is educational, "
                "clear, and follows a standard MCQ format.\n\n"
                f"Content: {summarized_content}\n\n"
                "Question:\n"
            )

            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            
            with torch.no_grad():
                outputs = self.model.generate(**inputs, max_length=256, temperature=0.7, top_p=0.9)

            question = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            return question if question else "Failed to generate question."
        except Exception as e:
            print(f"Error in question generation: {e}")
            return "Error generating question."

def load_dataset(filepath: str) -> pd.DataFrame:
    """Loads dataset from an Excel file."""
    try:
        df = pd.read_excel(filepath)
        if "Chapter_content" not in df.columns:
            print("Warning: 'Chapter_content' column not found in dataset")
            return pd.DataFrame()
        return df
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return pd.DataFrame()

def create_quiz_from_data(data: pd.DataFrame, quiz_generator: QuizGenerator):
    """Creates quiz questions from the dataset."""
    questions = []
    
    for idx, row in data.iterrows():
        print(f"Generating question {idx + 1}/{len(data)}...")
        content = str(row.get("Chapter_content", "")).strip()
        if not content:
            continue
            
        question = quiz_generator.generate_question_from_content(content)
        if question and "Failed to generate" not in question:
            questions.append(question)
        
        if len(questions) >= 5:  # Limit number of questions
            break
            
    return questions

def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    
    dataset_path = "dataset_syllabus.xlsx"
    data = load_dataset(dataset_path)
    
    if data.empty:
        print("Dataset is empty or could not be loaded.")
        return
    
    print(f"Loaded dataset with {len(data)} rows")
    
    quiz_generator = QuizGenerator()
    print("Generating questions...")
    questions = create_quiz_from_data(data, quiz_generator)
    
    print("\nGenerated Questions:")
    for idx, question in enumerate(questions, start=1):
        print(f"\nQuestion {idx}:")
        print(question)
        print("-" * 50)

if __name__ == "__main__":
    main()

RuntimeError: Failed to import transformers.models.bart.modeling_tf_bart because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [3]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import random
from typing import List, Dict, Tuple

class QuizGenerator:
    def __init__(self, model_name: str = "gpt2"):
        """Initialize the quiz generator with a pre-trained model."""
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.model.eval()

    def generate_question_from_content(self, content: str) -> str:
        """Generate a multiple-choice question from the given chapter content."""
        prompt = f"Generate a multiple-choice question based on the following chapter content: {content}\nQuestion:"
        inputs = self.tokenizer.encode(prompt, return_tensors="pt")
        outputs = self.model.generate(
            inputs,
            max_length=100,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            temperature=0.7
        )
        question = self.tokenizer.decode(outputs[0], skip_special_tokens=True).replace(prompt, "").strip()
        return question

    def generate_options(self, correct_answer: str, content: str) -> List[str]:
        """Generate plausible options including the correct answer based on chapter content."""
        options = [correct_answer]
        prompt = f"Given the chapter content: {content}, and the correct answer: {correct_answer}, generate an alternative answer:"
        
        while len(options) < 4:
            inputs = self.tokenizer.encode(prompt, return_tensors="pt")
            outputs = self.model.generate(
                inputs,
                max_length=50,
                num_return_sequences=1,
                no_repeat_ngram_size=2,
                temperature=0.8
            )
            new_option = self.tokenizer.decode(outputs[0], skip_special_tokens=True).replace(prompt, "").strip()
            if new_option and new_option not in options:
                options.append(new_option)
        
        random.shuffle(options)
        return options

class Quiz:
    def __init__(self, chapter_content: str):
        """Initialize a quiz with chapter content."""
        self.chapter_content = chapter_content
        self.questions = []
        self.current_question = 0
        self.score = 0

    def add_question(self, question: str, options: List[str], correct_answer: str):
        """Add a question to the quiz."""
        self.questions.append({
            "question": question,
            "options": options,
            "correct_answer": correct_answer
        })

    def check_answer(self, answer: str) -> bool:
        """Check if the provided answer is correct."""
        if self.current_question >= len(self.questions):
            return False
        is_correct = answer == self.questions[self.current_question]["correct_answer"]
        if is_correct:
            self.score += 1
        self.current_question += 1
        return is_correct

    def get_current_question(self) -> Dict:
        """Get the current question."""
        if self.current_question >= len(self.questions):
            return None
        return self.questions[self.current_question]

    def get_score(self) -> Tuple[int, int]:
        """Get the current score and total questions."""
        return self.score, len(self.questions)

def create_quiz_from_data(data: pd.DataFrame, quiz_generator: QuizGenerator) -> Quiz:
    """Create a quiz from the dataset using only chapter content."""
    row = data.sample(1).iloc[0]
    Chapter_content = row['Chapter_content']
    quiz = Quiz(Chapter_content)
    num_questions = 5  # Adjustable number of questions
    for _ in range(num_questions):
        question = quiz_generator.generate_question_from_content(Chapter_content)
        correct_answer = quiz_generator.generate_question_from_content(Chapter_content)
        options = quiz_generator.generate_options(correct_answer, Chapter_content)
        quiz.add_question(question, options, correct_answer)
    return quiz

# Load the dataset
def load_dataset(filepath: str) -> pd.DataFrame:
    return pd.read_excel(filepath)

# Example usage
def main():
    dataset_path = "dataset_syllabus.xlsx"
    data = load_dataset(dataset_path)
    quiz_generator = QuizGenerator()
    quiz = create_quiz_from_data(data, quiz_generator)
    
    while True:
        question = quiz.get_current_question()
        if not question:
            break
        print("\nQuestion:", question['question'])
        for i, option in enumerate(question['options']):
            print(f"{chr(65 + i)}. {option}")
        answer = input("\nYour answer (A/B/C/D): ").upper()
        if answer in ['A', 'B', 'C', 'D']:
            selected_answer = question['options'][ord(answer) - ord('A')]
            is_correct = quiz.check_answer(selected_answer)
            print("Correct!" if is_correct else "Incorrect!")
    
    score, total = quiz.get_score()
    print(f"\nFinal Score: {score}/{total}")

if __name__ == "__main__":
    main()


Token indices sequence length is longer than the specified maximum sequence length for this model (4062 > 1024). Running this sequence through the model will result in indexing errors
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


ValueError: Input length of input_ids is 4062, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

In [34]:
 def generate_options(self, correct_answer: str, content: str) -> List[str]:
        """Generate plausible options including the correct answer."""
        options = [correct_answer]
        prompt = f"Given the topic: {content}, and the correct answer: {correct_answer}, generate an alternative answer:"
        
        while len(options) < 4:
            inputs = self.tokenizer.encode(prompt, return_tensors="pt")
            outputs = self.model.generate(
                inputs,
                max_length=50,
                num_return_sequences=1,
                no_repeat_ngram_size=2,
                temperature=0.8
            )
            new_option = self.tokenizer.decode(outputs[0], skip_special_tokens=True).replace(prompt, "").strip()
            if new_option and new_option not in options:
                options.append(new_option)
        
        random.shuffle(options)
        return options

class Quiz:
    def __init__(self, board: str, grade: int, subject: str, chapter: str):
        """Initialize a quiz with metadata."""
        self.board = board
        self.grade = grade
        self.subject = subject
        self.chapter = chapter
        self.questions = []
        self.current_question = 0
        self.score = 0

    def add_question(self, question: str, options: List[str], correct_answer: str):
        """Add a question to the quiz."""
        self.questions.append({
            "question": question,
            "options": options,
            "correct_answer": correct_answer
        })

    def check_answer(self, answer: str) -> bool:
        """Check if the provided answer is correct."""
        if self.current_question >= len(self.questions):
            return False
        is_correct = answer == self.questions[self.current_question]["correct_answer"]
        if is_correct:
            self.score += 1
        self.current_question += 1
        return is_correct

    def get_current_question(self) -> Dict:
        """Get the current question."""
        if self.current_question >= len(self.questions):
            return None
        return self.questions[self.current_question]

    def get_score(self) -> Tuple[int, int]:
        """Get the current score and total questions."""
        return self.score, len(self.questions)

In [35]:
def create_quiz_from_data(data: pd.DataFrame, quiz_generator: QuizGenerator) -> Quiz:
    """Create a quiz from the dataset."""
    row = data.sample(1).iloc[0]
    quiz = Quiz(
        board=row['Board'],
        grade=row['Grade'],
        subject=row['Subject'],
        chapter=row['Chapter']
    )
    content = row['chapter_content']
    num_questions = 5  # Adjustable number of questions
    for _ in range(num_questions):
        question = quiz_generator.generate_question_from_content(content)
        correct_answer = quiz_generator.generate_question_from_content(content)
        options = quiz_generator.generate_options(correct_answer, content)
        quiz.add_question(question, options, correct_answer)
    return quiz

# Load the dataset
def load_dataset(filepath: str) -> pd.DataFrame:
    return pd.read_excel(filepath)

# Example usage
def main():
    dataset_path = "dataset_syllabus.xlsx"
    data = load_dataset(dataset_path)
    quiz_generator = QuizGenerator()
    quiz = create_quiz_from_data(data, quiz_generator)

In [36]:
    while True:
        question = quiz.get_current_question()
        if not question:
            break
        print("\nQuestion:", question['question'])
        for i, option in enumerate(question['options']):
            print(f"{chr(65 + i)}. {option}")
        answer = input("\nYour answer (A/B/C/D): ").upper()
        if answer in ['A', 'B', 'C', 'D']:
            selected_answer = question['options'][ord(answer) - ord('A')]
            is_correct = quiz.check_answer(selected_answer)
            print("Correct!" if is_correct else "Incorrect!")
    
    score, total = quiz.get_score()
    print(f"\nFinal Score: {score}/{total}")

if __name__ == "__main__":
        main()


NameError: name 'quiz' is not defined

In [14]:
class Quiz:
    def __init__(self, board: str, grade: int, subject: str, chapter: str):
        """Initialize a quiz with metadata."""
        self.board = board
        self.grade = grade
        self.subject = subject
        self.chapter = chapter
        self.questions = []
        self.current_question = 0
        self.score = 0

In [15]:
def add_question(self, question: str, options: List[str], correct_answer: str):
        """Add a question to the quiz."""
        self.questions.append({
            "question": question,
            "options": options,
            "correct_answer": correct_answer
        })

In [16]:
def check_answer(self, answer: str) -> bool:
        """Check if the provided answer is correct."""
        if self.current_question >= len(self.questions):
            return False
        
        is_correct = answer == self.questions[self.current_question]["correct_answer"]
        if is_correct:
            self.score += 1
        self.current_question += 1
        return is_correct

In [20]:
def get_current_question(self) -> Dict:
        """Get the current question."""
        if self.current_question >= len(self.questions):
            return None
        return self.questions[self.current_question]

def get_score(self) -> Tuple[int, int]:
        """Get the current score and total questions."""
        return self.score, len(self.questions)

In [24]:
def create_quiz_from_data(data: pd.DataFrame, quiz_generator: QuizGenerator) -> Quiz:
    """Create a quiz from the provided dataset."""
    # Sample implementation - you'll need to adapt this to your actual data structure
    row = data.sample(1).iloc[0]
    
    quiz = Quiz(
        board=row['Board'],
        grade=row['Grade'],
        subject=row['Subject'],
        chapter=row['Chapter_title']
    )
    
    # Generate questions from chapter content
    content = row['Chapter_content']
    

In [25]:
num_questions = 20  # You can adjust this number
for _ in range(num_questions):
        # Generate question and correct answer
        question = quiz_generator.generate_question_from_content(content)
        correct_answer = quiz_generator.generate_question_from_content(content)  # This is simplified
        
        # Generate options
        options = quiz_generator.generate_options(correct_answer, content)
        
        # Add question to quiz
        quiz.add_question(question, options, correct_answer)
    
return quiz

NameError: name 'quiz_generator' is not defined