In [1]:
from transformers import pipeline, BertTokenizer, BertForQuestionAnswering
# from chromadb import ChromaClient
import csv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

file_path = "knowledge_base.txt"

In [3]:
# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
text = read_text_file(file_path)
print(text)

Copyright © 2022 by McGraw Hill LLC. All rights reserved. Except as permitted under
the United States Copyright Act of 1976, no part of this publication may be reproduced
or distributed in any form or by any means, or stored in a database or retrieval system,
without the prior written permission of the publisher.
ISBN: 978-1-26-047380-3
MHID:      1-26-047380-5
The material in this eBook also appears in the print version of this title: ISBN: 978-1-
26-047379-7, MHID: 1-26-047379-1.
eBook conversion by codeMantra
Version 1.0
All trademarks are trademarks of their respective owners. Rather than put a trademark
symbol after every occurrence of a trademarked name, we use names in an editorial
fashion only, and to the benefit of the trademark owner, with no intention of infringement
of the trademark. Where such designations appear in this book, they have been printed
with initial caps.
McGraw-Hill Education eBooks are available at special quantity discounts to use as
premiums and sales prom

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Initialize the RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Maximum size of each chunk
    chunk_overlap=50  # Overlap between chunks
)

# Split the text into chunks
chunks = text_splitter.split_text(text)


In [6]:
chunks[0]

'Copyright © 2022 by McGraw Hill LLC. All rights reserved. Except as permitted under\nthe United States Copyright Act of 1976, no part of this publication may be reproduced\nor distributed in any form or by any means, or stored in a database or retrieval system,\nwithout the prior written permission of the publisher.\nISBN: 978-1-26-047380-3\nMHID:      1-26-047380-5\nThe material in this eBook also appears in the print version of this title: ISBN: 978-1-\n26-047379-7, MHID: 1-26-047379-1.\neBook conversion by codeMantra\nVersion 1.0\nAll trademarks are trademarks of their respective owners. Rather than put a trademark\nsymbol after every occurrence of a trademarked name, we use names in an editorial\nfashion only, and to the benefit of the trademark owner, with no intention of infringement\nof the trademark. Where such designations appear in this book, they have been printed\nwith initial caps.\nMcGraw-Hill Education eBooks are available at special quantity discounts to use as'

In [7]:
from transformers import pipeline

# Use a model that is fine-tuned for question answering
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

questions = []



response = qa_pipeline({'question': "Generate multiple choice questions from this text:", 'context': chunks[0]})
# question = response['question'].strip()
# answer = response['answer'].strip()

# options = "Option A, Option B, Option C"
# questions.append({'question': question, 'options': options, 'answer': answer})

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [8]:
response

{'score': 0.02022687904536724,
 'start': 37,
 'end': 56,
 'answer': 'All rights reserved'}

In [9]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import random

# Load the T5 model and tokenizer
model_name = "valhalla/t5-base-qg-hl"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Load a model for generating distractors
distractor_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# # Define your text
# text = """Your input text here"""

# # Initialize the RecursiveCharacterTextSplitter
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1000,  # Maximum size of each chunk
#     chunk_overlap=50  # Overlap between chunks
# )

# # Split the text into chunks
# chunks = text_splitter.split_text(text)

# Function to generate a question
def generate_question(context):
    input_text = f"generate question: {context}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(input_ids)
    question = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

# Function to generate distractors
def generate_distractors(context, answer):
    sentences = context.split('.')
    random.shuffle(sentences)
    distractors = []
    for sentence in sentences:
        if answer.lower() not in sentence.lower():
            distractors.append(sentence.strip())
        if len(distractors) >= 2:
            break
    return distractors if len(distractors) == 2 else []

# Generate questions and options from each chunk
questions = []
# for i, chunk in enumerate(chunks):
question = generate_question(chunks[6])

# Extract the answer from the context (simplified approach)
input_ids = tokenizer.encode(question, return_tensors="pt")
outputs = model.generate(input_ids, max_length=150, num_return_sequences=1)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

distractors = generate_distractors(chunks[6], answer)

if len(distractors) == 2:
    options = f"Option A: {answer}, Option B: {distractors[0]}, Option C: {distractors[1]}"
    random.shuffle(options.split(", "))  # Randomize the options
    questions.append({'question': question, 'options': options, 'answer': answer})

    # if len(questions) == 50:
    #     break

# Print or process the generated questions
for q in questions:
    print(f"Question: {q['question']}")
    print(f"Options: {q['options']}")
    print(f"Answer: {q['answer']}")
    print()


  return self.fget.__get__(instance, owner)()
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Question: What is the name of the doctor who is a board certified anesthesiologist
Options: Option A: What is the name of the doctor who is a board certified anesthesiologist?, Option B: Finneran IV, MD and Brian M, Option C: Vrooman, MD, MS, and Kimberly M
Answer: What is the name of the doctor who is a board certified anesthesiologist?

