### Question-Answering Chatbot - RAG framework

### Import libraries 

In [None]:
import os
import torch

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
#from transformers import AutoTokenizer, AutoModelForQuestionAnswering
#from datasets import load_dataset
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_transformers import Html2TextTransformer
#from langchain.document_loaders import AsyncChromiumLoader

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

### Load quantized Mistral 7B

In [None]:
#################################################################
# Tokenizer
#################################################################

model_name = 'mistralai/Mistral-7B-Instruct-v0.1'

# Downloaded files are cached under ./model_data (create that folder on the
# AWS instance beforehand if it does not exist yet).
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir="./model_data")
# Reuse the EOS token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

#################################################################
# Set up quantization config
#################################################################
# Resolve the dtype name ("float16") to the actual torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16.
# get_device_capability() raises when no CUDA device is present, so only
# query it after confirming that CUDA is available.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

#################################################################
# Load pre-trained model
#################################################################
# Load Mistral-7B-Instruct with the 4-bit quantization config; the weights
# are cached in ./model_data next to the tokenizer files.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir="./model_data",
    quantization_config=bnb_config,
)

### Let's try the pre-trained Mistral 7B model with 4-bit precision

In [None]:
# Tokenize the instruction-formatted prompt and move it to whichever device
# the model lives on (instead of assuming 'cuda').
encoded_prompt = tokenizer("[INST] Tell me about fantasy football? [/INST]", return_tensors="pt").to(model.device)
inputs_not_chat = encoded_prompt["input_ids"]

# The pad token was set to the EOS token, so generate() cannot infer the
# attention mask from the ids alone; pass the mask and pad id explicitly.
generated_ids = model.generate(inputs_not_chat,
                               attention_mask=encoded_prompt["attention_mask"],
                               pad_token_id=tokenizer.pad_token_id,
                               max_new_tokens=1000,
                               do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
# Show the generated text; without this the cell produces no visible output.
print(decoded[0])

### Count number of trainable parameters

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Despite its name, this function prints nothing: it returns the report so
    that the caller can print or log it.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad``.

    Returns:
        A three-line string with the trainable count, the total count and the
        trainable percentage (two decimals). A model without any parameters
        is reported as ``0.00%`` instead of raising ``ZeroDivisionError``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count
    # Guard the division: an empty model has a total of zero parameters.
    percentage = 100 * trainable_model_params / all_model_params if all_model_params else 0.0
    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

# The helper only builds the report string, so print it explicitly.
print(print_number_of_trainable_model_parameters(model))

### Build Mistral text generation pipeline

In [None]:
# Text-generation pipeline around the loaded model and tokenizer.
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    # temperature only takes effect when sampling is enabled; without
    # do_sample=True generation is greedy and the temperature is ignored.
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    # The returned text contains the prompt followed by the completion.
    return_full_text=True,
    max_new_tokens=1000,
)

In [None]:
# Wrap the transformers pipeline so that LangChain can use it as an LLM.
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

### Load and chunk documents. Load chunked documents into FAISS index

In [None]:
import langchain

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [None]:
def read_doc(directory):
    """Load the PDF files found under *directory*.

    Args:
        directory: Path of the folder handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(directory).load()`` returns
        (presumably a list of LangChain documents - confirm against the
        loader's documentation).
    """
    return PyPDFDirectoryLoader(directory).load()

In [None]:
# Load everything under documents/ and show how many documents came back.
doc=read_doc('documents/')
len(doc)

In [None]:
# Split the loaded documents into chunks of at most 1000 characters, with a
# 150-character overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# `doc` holds the loaded documents; `docs` receives the resulting chunks.
docs = text_splitter.split_documents(doc)
# Preview the chunks that were just produced (not the unsplit input).
print(docs[:3])




In [None]:
# Sentence-transformers checkpoint used for the embeddings
# (alternative tried earlier: "sentence-transformers/all-MiniLM-l6-v2").
modelPath = "sentence-transformers/all-mpnet-base-v2"

# Encoding options: leave the embedding vectors un-normalised.
# No model_kwargs are passed, so no device is selected explicitly here.
encode_kwargs = dict(normalize_embeddings=False)

# Embedding wrapper built from the checkpoint and the encoding options.
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_name=modelPath,
)


In [None]:
### Embeddings
# Sanity check: embed a sample query and preview the first three values.
text = "what is encoder?"
query_result = embeddings.embed_query(text)
query_result[:3]

In [None]:
### FAISS vector store
# Build the FAISS store from the chunks in `docs` using the embedding model.
db = FAISS.from_documents(docs, embeddings)

In [None]:
### Test the FAISS vector store with a similarity search
question = "What is encoder?"
searchDocs = db.similarity_search(question)
# Show the text of the first returned hit.
print(searchDocs[0].page_content)

In [None]:
### FAISS DB retriever
# Expose the vector store as a retriever using similarity search with k=3.
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 3}
)

### Create PromptTemplate and LLMChain

In [None]:
# Instruction-style prompt with two placeholders: the retrieved context and
# the user's question. Assembled line by line; the final element keeps the
# single trailing space that follows the last newline.
prompt_template = "\n".join([
    "",
    "### [INST] Instruction: Answer the question based on your fantasy football knowledge. Here is context to help:",
    "",
    "{context}",
    "",
    "### QUESTION:",
    "{question} [/INST]",
    " ",
])

# Turn the raw template into a LangChain prompt object.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Chain that fills the prompt and sends it to the Mistral LLM.
llm_chain = LLMChain(prompt=prompt, llm=mistral_llm)

In [None]:
# Smoke-test the chain on its own, with an empty context (no retrieval).
llm_chain.invoke({"context": "", "question": "What is BERT?"})

### Build RAG Chain

In [None]:
# The incoming question feeds both branches: the retriever produces the
# "context" entry and the passthrough forwards the question unchanged.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
rag_chain = rag_inputs | llm_chain

result = rag_chain.invoke("What is BERT?")

In [None]:
# Inspect the context entry that was passed to the LLM chain.
result['context']

In [None]:
# Print the chain output stored under the 'text' key.
print(result['text'])

In [None]:
import os
import torch

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer  # Assuming you have installed the trl library

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFDirectoryLoader

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

#################################################################
# Tokenizer
#################################################################

model_name = 'mistralai/Mistral-7B-Instruct-v0.1'

# Tokenizer files are cached under ./model_data.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir="./model_data",
    trust_remote_code=True,
)
# Pad on the right and reuse the EOS token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

#################################################################
# Bitsandbytes parameters
#################################################################

use_4bit = True                     # load the base model in 4-bit precision
bnb_4bit_compute_dtype = "float16"  # compute dtype for the 4-bit base model
bnb_4bit_quant_type = "nf4"         # quantization type (fp4 or nf4)
use_nested_quant = False            # nested (double) quantization

#################################################################
# Set up quantization config
#################################################################

# Resolve the dtype name to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

quantization_settings = {
    "load_in_4bit": use_4bit,
    "bnb_4bit_quant_type": bnb_4bit_quant_type,
    "bnb_4bit_compute_dtype": compute_dtype,
    "bnb_4bit_use_double_quant": use_nested_quant,
}
bnb_config = BitsAndBytesConfig(**quantization_settings)

#################################################################
# LoRA configuration (optional, for fine-tuning)
#################################################################

# get_peft_model is what attaches a *new* LoRA adapter to a base model;
# PeftModel.from_pretrained only loads an already-trained adapter and expects
# (base_model, adapter_id), so it cannot be used with a bare model name.
from peft import get_peft_model

lora_r = 64         # LoRA attention dimension (rank)
lora_alpha = 16     # LoRA scaling alpha
lora_dropout = 0.1  # LoRA dropout probability

# LoraConfig's keyword names are lora_alpha / lora_dropout (not alpha / dropout).
lora_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

# Load the 4-bit quantized base model first, then wrap it with the adapter.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir="./model_data",
    quantization_config=bnb_config,
)
model = get_peft_model(base_model, lora_config)

def print_number_of_trainable_model_parameters(model):
    """Build a short report on how many model parameters are trainable.

    The function returns the report rather than printing it.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad``.

    Returns:
        A three-line string with the trainable count, the total count and the
        trainable percentage (two decimals). A model without any parameters
        is reported as ``0.00%`` instead of raising ``ZeroDivisionError``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count
    # Guard the division: an empty model has a total of zero parameters.
    percentage = 100 * trainable_model_params / all_model_params if all_model_params else 0.0
    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

# The helper only builds the report string, so print it explicitly.
print(print_number_of_trainable_model_parameters(model))

# Text-generation pipeline around the loaded model and tokenizer.
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    # temperature only takes effect when sampling is enabled; without
    # do_sample=True generation is greedy and the temperature is ignored.
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    # The returned text contains the prompt followed by the completion.
    return_full_text=True,
    max_new_tokens=1000,
)
# Wrap the transformers pipeline so that LangChain can use it as an LLM.
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
#################################################################
# Text Processing
#################################################################

def read_doc(directory):
    """Load the PDF files found under *directory*.

    Args:
        directory: Path of the folder handed to ``PyPDFDirectoryLoader``.

    Returns:
        The result of the loader's ``load()`` call (presumably a list of
        LangChain documents - confirm against the loader's documentation).
    """
    return PyPDFDirectoryLoader(directory).load()

# Load everything under documents/.
doc = read_doc('documents/')
len(doc)

# Chunking: pieces of at most 1000 characters, overlapping by 150 characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=150, chunk_size=1000)
docs = text_splitter.split_documents(doc)

# Sentence-transformers checkpoint used to embed the chunks.
modelPath = 'sentence-transformers/all-mpnet-base-v2'

# Embedding wrapper; vectors are left un-normalised (worth experimenting with).
embedding_options = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(encode_kwargs=embedding_options, model_name=modelPath)

# Index the chunks in a FAISS vector store.
db = FAISS.from_documents(docs, embeddings)

# Retriever over the store: similarity search with k=3.
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 3},
)

# Instruction-style prompt with two placeholders: the retrieved context and
# the user's question. Written as adjacent string literals; the resulting
# text starts and ends with a newline.
prompt_template = (
    "\n"
    "### [INST] Instruction: Answer the question based on your fantasy football knowledge. Here is context to help:\n"
    "\n"
    "{context}\n"
    "\n"
    "### QUESTION:\n"
    "{question} [/INST]\n"
)

# Turn the raw template into a LangChain prompt object.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Chain that fills the prompt and sends it to the Mistral LLM.
llm_chain = LLMChain(prompt=prompt, llm=mistral_llm)

# RAG chain: the incoming question feeds the retriever (-> "context") and is
# forwarded unchanged (-> "question") before reaching the LLM chain.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
rag_chain = rag_inputs | llm_chain

def answer_question(question):
    """Answer *question* with the RAG chain.

    The chain's first step hands its input to both the retriever and the
    passthrough, so it must receive the raw question string. Wrapping the
    question in a dict would send that dict to the retriever as the query.

    Args:
        question: The user's question as plain text.

    Returns:
        The value stored under the ``'text'`` key of the chain's result.
    """
    result = rag_chain.invoke(question)
    return result['text']

# Example usage: ask one question and print the answer.
question = "What is the salary cap in fantasy football?"
answer = answer_question(question)
print("Answer:", answer)


In [None]:
import os
import json
import torch

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFDirectoryLoader  # Assuming PDFs
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

#################################################################
# Tokenizer
#################################################################

model_name = 'mistralai/Mistral-7B-Instruct-v0.1'

# Tokenizer files are cached under ./model_data.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir="./model_data",
    trust_remote_code=True,
)
# Pad on the right and reuse the EOS token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

#################################################################
# LoRA parameters
#################################################################

# get_peft_model is what attaches a *new* LoRA adapter to a base model;
# PeftModel.from_pretrained only loads an already-trained adapter and expects
# (base_model, adapter_id), so it cannot be used with a bare model name.
from peft import get_peft_model

lora_r = 64  # LoRA attention dimension
lora_alpha = 16  # LoRA scaling alpha
lora_dropout = 0.1  # LoRA dropout probability

#################################################################
# Load pre-trained model with LoRA
#################################################################

# LoraConfig's keyword names are lora_alpha / lora_dropout (not alpha / dropout).
lora_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

# Load the base model first (no quantization in this cell), then wrap it
# with the LoRA adapter.
base_model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="./model_data")
model = get_peft_model(base_model, lora_config)

def print_number_of_trainable_model_parameters(model):
    """Report the trainable share of a model's parameters as text.

    The function returns the report rather than printing it.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad``.

    Returns:
        A three-line string with the trainable count, the total count and the
        trainable percentage (two decimals). A model without any parameters
        is reported as ``0.00%`` instead of raising ``ZeroDivisionError``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count
    # Guard the division: an empty model has a total of zero parameters.
    percentage = 100 * trainable_model_params / all_model_params if all_model_params else 0.0
    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

# The helper only builds the report string, so print it explicitly.
parameter_report = print_number_of_trainable_model_parameters(model)
print(parameter_report)

#################################################################
# Text Processing
#################################################################

# Read the files under documents/ (assumed to be PDFs).
pdf_loader = PyPDFDirectoryLoader("documents/")
documents = pdf_loader.load()

# Chunking: pieces of at most 1000 characters, overlapping by 150 characters
# (tune both values as needed).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=150, chunk_size=1000)
docs = text_splitter.split_documents(documents)

#################################################################
# Generate Training Data from Documents (Improved Approach)
#################################################################

# This section is meant to generate question-answer pairs from the document
# chunks (including research papers). Section detection is still a
# placeholder: until it is implemented, no pairs are produced.

# The prompt is identical for every section, so build it once up front.
# PromptTemplate has no `input_string` argument; from_template() derives the
# input variables from the placeholders in the text.
question_template = PromptTemplate.from_template(
    "{section_content}\n"
    "What are the key findings or contributions of this section?"
)

# Device of the model's weights, so the inputs can be placed alongside them.
model_device = next(model.parameters()).device

training_data = []

# `chunk` (rather than `doc`) avoids clobbering a notebook-level `doc` variable.
for chunk in docs:
    # TODO: identify the relevant sections of `chunk` (Introduction,
    # Methodology, ...), e.g. via named entity recognition (spaCy, NLTK) or
    # keyword matching. Each section is expected to be a plain string.
    sections = []

    for section in sections:
        # Generate from the full prompt (section text + question), not from
        # the bare section text.
        question = question_template.format(section_content=section)

        # Keep the batch dimension: generate() expects (batch, sequence) ids.
        encoded = tokenizer(question, return_tensors="pt").to(model_device)
        prompt_length = encoded["input_ids"].shape[1]

        # max_new_tokens bounds only the answer; max_length would also count
        # the prompt and can be exhausted by a long section.
        output_ids = model.generate(
            **encoded,
            max_new_tokens=512,
            pad_token_id=tokenizer.pad_token_id,
        )

        # Drop the echoed prompt tokens so that only the answer is stored.
        answer_ids = output_ids[0][prompt_length:]
        training_data.append({
            "question": question,
            "answer": tokenizer.decode(answer_ids, skip_special_tokens=True),
        })

################################################################


In [None]:
#################################################################
# 1. Embedding Layer (Replace with your implementation)
# This step converts text data (questions and answers) into numerical representations.

# Example (Replace with actual embedding code)
# embedding_model = SentenceTransformers('all-mpnet-base-v2')  # Example embedding model
# question_embeddings = embedding_model.encode(training_data['question'])
# answer_embeddings = embedding_model.encode(training_data['answer'])

# 2. Retrieval (Replace with your implementation)
# This step retrieves relevant answers from your knowledge base (documents) based on the question embeddings.

# Example (Replace with actual retrieval code)
# retriever = FAISS()  # Example retrieval method
# retriever.fit(answer_embeddings)

# 3. SFT Training (Replace with your implementation)
# This step trains the model using the question-answer pairs and the retrieval system.

# Example (Replace with actual SFT training code)
# sft_trainer = SFTTrainer(
#     model=model,
#     tokenizer=tokenizer,
#     retriever=retriever,
#     training_data=training_data
# )
# sft_trainer.train()

# 4. Create RAG Chain (Replace with your implementation)
# This step combines the retrieval and generation (LLM) components into a single system
# for efficient question answering.

# Example (Replace with actual RAG chain code)
# rag_chain = LLMChain(
#     question_embedding_model=embedding_model,
#     retriever=retriever,
#     llm_model=model,
#     tokenizer=tokenizer
# )

# Now you can use the `rag_chain` object to answer your questions using the trained model and retrieved documents.
