QUESTION ANSWERING BASED ON FEEDED CONTENT BY LLAMA 3.2

In [26]:
!pip -q install git+https://github.com/huggingface/transformers
!pip install -q datasets loralib sentencepiece
!pip -q install bitsandbytes accelerate xformers
!pip -q install langchain
!pip -q install gradio
!pip -q install peft chromadb
!pip -q install unstructured
!pip install -q sentence_transformers
!pip -q install pypdf peft
!pip install langchain_community

from google.colab import drive
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import json
import textwrap
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain import HuggingFacePipeline
from langchain import PromptTemplate,  LLMChain
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory
import gradio as gr
import random
import time



# drive.mount('/content/drive')

# LLaMA2 7B Chat configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
)
model_id = "meta-llama/Llama-3.2-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id, token='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')
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, token='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')


# Prompt engineering
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template = B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template


# Load PDF and create vector database
loader = PyPDFLoader("/content/NLP PT2 .pdf") # Make sure data.pdf exists in /content/
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=50, length_function=len)
pages = loader.load_and_split(text_splitter)
db = Chroma.from_documents(pages, HuggingFaceEmbeddings())

# Define prompts and memory
instruction = "Given the context that has been provided. \n {context}, Answer the following question - \n{question}"
system_prompt = """You are an expert in question and answering.
You will be given a context to answer from. Be precise in your answers wherever possible.
In case you are sure you don't know the answer then you say that based on the context you don't know the answer.
In all other instances you provide an answer to the best of your capability. Cite urls when you can access them related to the context."""


template = get_prompt(instruction, system_prompt)
prompt = PromptTemplate(template=template, input_variables=["context", "question"])
memory = ConversationBufferWindowMemory(memory_key="chat_history", k=5, return_messages=True)
retriever = db.as_retriever()


# Create pipeline and chatbot class
def create_pipeline(max_new_tokens=512):
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=max_new_tokens, temperature=0.1, device_map="auto") # Added device_map
    return pipe

class GunaBot:
    def __init__(self, memory, prompt, retriever=retriever):
        self.memory = memory
        self.prompt = prompt
        self.retriever = retriever
    def create_chat_bot(self, max_new_tokens=512):
        hf_pipe = create_pipeline(max_new_tokens)
        llm = HuggingFacePipeline(pipeline=hf_pipe)
        qa = ConversationalRetrievalChain.from_llm(llm=llm, retriever=self.retriever, memory=self.memory, combine_docs_chain_kwargs={"prompt": self.prompt})
        return qa

# Initialize chatbot and Gradio interface
Guna_bot = GunaBot(memory=memory, prompt=prompt)
bot = Guna_bot.create_chat_bot()

# def clear_llm_memory():
#     bot.memory.clear()

# def update_prompt(sys_prompt):
#     if sys_prompt == "":
#         sys_prompt = system_prompt
#     template = get_prompt(instruction, sys_prompt)
#     prompt = PromptTemplate(template=template, input_variables=["context", "question"])
#     bot.combine_docs_chain.llm_chain.prompt = prompt

# with gr.Blocks() as demo:
#     update_sys_prompt = gr.Textbox(label="Update System Prompt")
#     # chatbot = gr.Chatbot(label="Guna Bot", height=300)
#     msg = gr.Textbox(label="Question")
#     clear = gr.ClearButton([msg, chatbot])
#     clear_memory = gr.Button(value="Clear LLM Memory")

#     def respond(message, chat_history):
#         try:
#             bot_message = bot({"question": message})['answer']
#         except Exception as e:
#             bot_message = f"An error occurred: {e}"
#         chat_history.append((message, bot_message))
#         return "", chat_history

#     msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
#     clear_memory.click(clear_llm_memory)
#     update_sys_prompt.submit(update_prompt, inputs=update_sys_prompt)


# demo.launch(share=True, debug=True) #share=True for public link



def chat_with_model():
    while True:
        user_input = input("Enter your question (or type 'exit' to quit): ")
        if user_input.lower() == 'exit':
            break
        try:
            bot_message = bot({"question": user_input})['answer']
            print(f"Bot: {bot_message}")
        except Exception as e:
            print(f"An error occurred: {e}")

chat_with_model()

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  db = Chroma.from_documents(pages, HuggingFaceEmbeddings())
Device set to use cuda:0


Enter your question (or type 'exit' to quit): What is semantic analysis and why it is considered difficult


  bot_message = bot({"question": user_input})['answer']
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Bot: [INST]<<SYS>>
You are an expert in question and answering.
You will be given a context to answer from. Be precise in your answers wherever possible.
In case you are sure you don't know the answer then you say that based on the context you don't know the answer.
In all other instances you provide an answer to the best of your capability. Cite urls when you can access them related to the context.
<</SYS>>

Given the context that has been provided. 
 1. What is semantic analysis, and why is it considered difﬁcult? 
Semantic Analysis is a part of Natural Language Processing (NLP) that focuses on understanding the meaning of text. While it seems easy for humans, 
understanding language is tricky for computers because of how complex and subjective human language can be. Semantic Analysis helps machines make 
sense of text by considering the meaning of words, the context in which they are used, and the structure of sentences. 
Parts of Semantic Analysis 
Semantic Analysis of Natural Lang

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Bot: [INST]<<SYS>>
You are an expert in question and answering.
You will be given a context to answer from. Be precise in your answers wherever possible.
In case you are sure you don't know the answer then you say that based on the context you don't know the answer.
In all other instances you provide an answer to the best of your capability. Cite urls when you can access them related to the context.
<</SYS>>

Given the context that has been provided. 
 1. What is semantic analysis, and why is it considered difﬁcult? 
Semantic Analysis is a part of Natural Language Processing (NLP) that focuses on understanding the meaning of text. While it seems easy for humans, 
understanding language is tricky for computers because of how complex and subjective human language can be. Semantic Analysis helps machines make 
sense of text by considering the meaning of words, the context in which they are used, and the structure of sentences. 
Parts of Semantic Analysis 
Semantic Analysis of Natural Lang

MCQ GENERATION USING LLAMA3.2

In [30]:
from google.colab import drive
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import json
import textwrap
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory
import random
import time

!pip -q install git+https://github.com/huggingface/transformers
!pip install -q datasets loralib sentencepiece
!pip -q install bitsandbytes accelerate xformers
!pip -q install langchain
!pip -q install peft chromadb
!pip -q install unstructured
!pip install -q sentence_transformers
!pip -q install pypdf
!pip install langchain_community

# drive.mount('/content/drive')

bnb_config = BitsAndBytesConfig(
    load_in_4bit=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
)
model_id = "meta-llama/Llama-3.2-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id, token='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')
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, token='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', device_map="auto")


# Prompt engineering
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """You are a helpful and informative assistant.  Generate multiple-choice questions (MCQs) based on the provided text. Each MCQ should have four options (a, b, c, d), with only one correct answer.  Clearly label the correct answer.  Focus on accuracy and ensure questions accurately reflect the content."""


def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template = B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template


def generate_mcqs(pdf_path):
    # Load PDF and create vector database
    loader = PyPDFLoader(pdf_path)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=50, length_function=len)
    pages = loader.load_and_split(text_splitter)
    db = Chroma.from_documents(pages, HuggingFaceEmbeddings())
    retriever = db.as_retriever()

    instruction = "Generate multiple-choice questions (MCQs) with concise questions and one-word or short-answer options (a, b, c, d). Indicate the correct answer for each question based on the provided context:\n{context}"
    template = get_prompt(instruction)
    prompt = PromptTemplate(template=template, input_variables=["context"])

    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512, temperature=0.1, device_map="auto")
    llm = HuggingFacePipeline(pipeline=pipe)
    llm_chain = LLMChain(prompt=prompt, llm=llm)

    # Process each chunk and generate MCQs
    all_mcqs = []
    for doc in pages:
        mcqs = llm_chain.run(doc.page_content)
        all_mcqs.append(mcqs)

    return "\n\n".join(all_mcqs)


# Example usage:
pdf_file_path = "/content/test_document.pdf"
mcqs = generate_mcqs(pdf_file_path)
mcqs

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  db = Chroma.from_documents(pages, HuggingFaceEmbeddings())
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'[INST]<<SYS>>\nYou are a helpful and informative assistant.  Generate multiple-choice questions (MCQs) based on the provided text. Each MCQ should have four options (a, b, c, d), with only one correct answer.  Clearly label the correct answer.  Focus on accuracy and ensure questions accurately reflect the content.\n<</SYS>>\n\nGenerate multiple-choice questions (MCQs) with concise questions and one-word or short-answer options (a, b, c, d). Indicate the correct answer for each question based on the provided context:\n1.What is semantic analysis, and why is it considered difﬁcult? Semantic Analysis is a part of Natural Language Processing (NLP) that focuses on understanding the meaning of text. While it seems easy for humans, understanding language is tricky for computers because of how complex and subjective human language can be. Semantic Analysis helps machines make sense of text by considering the meaning of words, the context in which they are used, and the structure of sentences.