In [1]:
import shutil
import requests
import sys
from typing import Optional, List, Tuple
from langchain_core.language_models import BaseChatModel
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.llms import HuggingFaceHub
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import config 

In [2]:
pdfPath = config.pdfPath

In [3]:
if pdfPath is None:
    raise ValueError("pdfPath is None. Please set the  pdf path in config.py.")

In [4]:
loader = PyPDFLoader(pdfPath)

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,  
        chunk_overlap=200,
        add_start_index=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )
try:
    langchain_docs = loader.load_and_split(text_splitter=text_splitter) #loads and slits
    #docs = loader.load()
    #langchain_docs = text_splitter.split_documents(docs)
except Exception as e:
    raise ValueError("An error occurred:", e)

In [6]:
##creating Vector DB

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

embeddingModelName = "BAAI/bge-base-en-v1.5"

embeddingModel = HuggingFaceEmbeddings(model_name=embeddingModelName)

db = FAISS.from_documents(langchain_docs, embeddingModel)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
##Loading the Model to answer questions
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = 'HuggingFaceH4/zephyr-7b-beta'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [8]:
##Creating base Model Chain
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=200,
    pad_token_id=tokenizer.eos_token_id,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

 """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = prompt | llm | StrOutputParser()

In [9]:
##Creating Context Chain
from langchain_core.runnables import RunnablePassthrough

retriever = db.as_retriever()

rag_chain = (
 {"context": retriever, "question": RunnablePassthrough()}
    | llm_chain
)

In [10]:
import pandas as pd
from datasets import Dataset, DatasetDict
# generated_questions = pd.read_csv("datasets/db2_dataset.csv")
from datasets import load_dataset
datapoints = load_dataset("Ubaidbhat/StockInvestingForDummies", split = "train")

Downloading readme:   0%|          | 0.00/568 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 595k/595k [00:00<00:00, 1.14MB/s]


Generating train split:   0%|          | 0/791 [00:00<?, ? examples/s]

In [12]:
outputs = []
i = 1
n = len(datapoints)
for datapoint in datapoints:
    print("Inference number {}/{} in progress.....".format(i, n))
    if i >= 591 and i <= len(datapoints) - 10:
        question = datapoint["question"]
        correctAnswer = datapoint["answer"]
        ragAnswer = rag_chain.invoke(question)
        baseAnswer = llm_chain.invoke({"context":"", "question": question}) 
        outputs.append(
                {   
                    "question": question,
                    "correctAnswer": correctAnswer,
                    "ragAnswer": ragAnswer,
                    "baseModelAnswer":baseAnswer 
                
                }
            )
        generated_questions = pd.DataFrame.from_dict(outputs)
        generated_questions.to_csv("StocksQAWithZephr1.csv", index=False)
    i += 1

Inference number 1/791 in progress.....
Inference number 2/791 in progress.....
Inference number 3/791 in progress.....
Inference number 4/791 in progress.....
Inference number 5/791 in progress.....
Inference number 6/791 in progress.....
Inference number 7/791 in progress.....
Inference number 8/791 in progress.....
Inference number 9/791 in progress.....
Inference number 10/791 in progress.....
Inference number 11/791 in progress.....
Inference number 12/791 in progress.....
Inference number 13/791 in progress.....
Inference number 14/791 in progress.....
Inference number 15/791 in progress.....
Inference number 16/791 in progress.....
Inference number 17/791 in progress.....
Inference number 18/791 in progress.....
Inference number 19/791 in progress.....
Inference number 20/791 in progress.....
Inference number 21/791 in progress.....
Inference number 22/791 in progress.....
Inference number 23/791 in progress.....
Inference number 24/791 in progress.....
Inference number 25/791 i



Inference number 597/791 in progress.....




Inference number 598/791 in progress.....




Inference number 599/791 in progress.....




Inference number 600/791 in progress.....




Inference number 601/791 in progress.....




Inference number 602/791 in progress.....




Inference number 603/791 in progress.....




Inference number 604/791 in progress.....




Inference number 605/791 in progress.....




Inference number 606/791 in progress.....




Inference number 607/791 in progress.....




Inference number 608/791 in progress.....




Inference number 609/791 in progress.....




Inference number 610/791 in progress.....




Inference number 611/791 in progress.....




Inference number 612/791 in progress.....




Inference number 613/791 in progress.....




Inference number 614/791 in progress.....




Inference number 615/791 in progress.....




Inference number 616/791 in progress.....




Inference number 617/791 in progress.....




Inference number 618/791 in progress.....




Inference number 619/791 in progress.....




Inference number 620/791 in progress.....




Inference number 621/791 in progress.....




Inference number 622/791 in progress.....




Inference number 623/791 in progress.....




Inference number 624/791 in progress.....




Inference number 625/791 in progress.....




Inference number 626/791 in progress.....




Inference number 627/791 in progress.....




Inference number 628/791 in progress.....




Inference number 629/791 in progress.....




Inference number 630/791 in progress.....




Inference number 631/791 in progress.....




Inference number 632/791 in progress.....




Inference number 633/791 in progress.....




Inference number 634/791 in progress.....




Inference number 635/791 in progress.....




Inference number 636/791 in progress.....




Inference number 637/791 in progress.....




Inference number 638/791 in progress.....




Inference number 639/791 in progress.....




Inference number 640/791 in progress.....




Inference number 641/791 in progress.....




Inference number 642/791 in progress.....




Inference number 643/791 in progress.....




Inference number 644/791 in progress.....




Inference number 645/791 in progress.....




Inference number 646/791 in progress.....




Inference number 647/791 in progress.....




Inference number 648/791 in progress.....




Inference number 649/791 in progress.....




Inference number 650/791 in progress.....




Inference number 651/791 in progress.....




Inference number 652/791 in progress.....




Inference number 653/791 in progress.....




Inference number 654/791 in progress.....




Inference number 655/791 in progress.....




Inference number 656/791 in progress.....




Inference number 657/791 in progress.....




Inference number 658/791 in progress.....




Inference number 659/791 in progress.....




Inference number 660/791 in progress.....




Inference number 661/791 in progress.....




Inference number 662/791 in progress.....




Inference number 663/791 in progress.....




Inference number 664/791 in progress.....




Inference number 665/791 in progress.....




Inference number 666/791 in progress.....




Inference number 667/791 in progress.....




Inference number 668/791 in progress.....




Inference number 669/791 in progress.....




Inference number 670/791 in progress.....




Inference number 671/791 in progress.....




Inference number 672/791 in progress.....




Inference number 673/791 in progress.....




Inference number 674/791 in progress.....




Inference number 675/791 in progress.....




Inference number 676/791 in progress.....




Inference number 677/791 in progress.....




Inference number 678/791 in progress.....




Inference number 679/791 in progress.....




Inference number 680/791 in progress.....




Inference number 681/791 in progress.....




Inference number 682/791 in progress.....




Inference number 683/791 in progress.....




Inference number 684/791 in progress.....




Inference number 685/791 in progress.....




Inference number 686/791 in progress.....




Inference number 687/791 in progress.....




Inference number 688/791 in progress.....




Inference number 689/791 in progress.....




Inference number 690/791 in progress.....




Inference number 691/791 in progress.....




Inference number 692/791 in progress.....




Inference number 693/791 in progress.....




Inference number 694/791 in progress.....




Inference number 695/791 in progress.....




Inference number 696/791 in progress.....




Inference number 697/791 in progress.....




Inference number 698/791 in progress.....




Inference number 699/791 in progress.....




Inference number 700/791 in progress.....




Inference number 701/791 in progress.....




Inference number 702/791 in progress.....




Inference number 703/791 in progress.....




Inference number 704/791 in progress.....




Inference number 705/791 in progress.....




Inference number 706/791 in progress.....




Inference number 707/791 in progress.....




Inference number 708/791 in progress.....




Inference number 709/791 in progress.....




Inference number 710/791 in progress.....




Inference number 711/791 in progress.....




Inference number 712/791 in progress.....




Inference number 713/791 in progress.....




Inference number 714/791 in progress.....




Inference number 715/791 in progress.....




Inference number 716/791 in progress.....




Inference number 717/791 in progress.....




Inference number 718/791 in progress.....




Inference number 719/791 in progress.....




Inference number 720/791 in progress.....




Inference number 721/791 in progress.....




Inference number 722/791 in progress.....




Inference number 723/791 in progress.....




Inference number 724/791 in progress.....




Inference number 725/791 in progress.....




Inference number 726/791 in progress.....




Inference number 727/791 in progress.....




Inference number 728/791 in progress.....




Inference number 729/791 in progress.....




Inference number 730/791 in progress.....




Inference number 731/791 in progress.....




Inference number 732/791 in progress.....




Inference number 733/791 in progress.....




Inference number 734/791 in progress.....




Inference number 735/791 in progress.....




Inference number 736/791 in progress.....




Inference number 737/791 in progress.....




Inference number 738/791 in progress.....




Inference number 739/791 in progress.....




Inference number 740/791 in progress.....




Inference number 741/791 in progress.....




Inference number 742/791 in progress.....




Inference number 743/791 in progress.....




Inference number 744/791 in progress.....




Inference number 745/791 in progress.....




Inference number 746/791 in progress.....




Inference number 747/791 in progress.....




Inference number 748/791 in progress.....




Inference number 749/791 in progress.....




Inference number 750/791 in progress.....




Inference number 751/791 in progress.....




Inference number 752/791 in progress.....




Inference number 753/791 in progress.....




Inference number 754/791 in progress.....




Inference number 755/791 in progress.....




Inference number 756/791 in progress.....




Inference number 757/791 in progress.....




Inference number 758/791 in progress.....




Inference number 759/791 in progress.....




Inference number 760/791 in progress.....




Inference number 761/791 in progress.....




Inference number 762/791 in progress.....




Inference number 763/791 in progress.....




Inference number 764/791 in progress.....




Inference number 765/791 in progress.....




Inference number 766/791 in progress.....




Inference number 767/791 in progress.....




Inference number 768/791 in progress.....




Inference number 769/791 in progress.....




Inference number 770/791 in progress.....




Inference number 771/791 in progress.....




Inference number 772/791 in progress.....




Inference number 773/791 in progress.....




Inference number 774/791 in progress.....




Inference number 775/791 in progress.....




Inference number 776/791 in progress.....




Inference number 777/791 in progress.....




Inference number 778/791 in progress.....




Inference number 779/791 in progress.....




Inference number 780/791 in progress.....




Inference number 781/791 in progress.....




Inference number 782/791 in progress.....
Inference number 783/791 in progress.....
Inference number 784/791 in progress.....
Inference number 785/791 in progress.....
Inference number 786/791 in progress.....
Inference number 787/791 in progress.....
Inference number 788/791 in progress.....
Inference number 789/791 in progress.....
Inference number 790/791 in progress.....
Inference number 791/791 in progress.....


In [8]:
# import gradio as gr

# def predict(type, question):
#     if type == "Base":
#         ans = llm_chain.invoke({"context":"", "question": question})
#         return ans
#     else:
#         ans = rag_chain.invoke(question)
#         return ans    

# pred = gr.Interface(
#     fn=predict,
#     inputs=[
#         gr.Radio(['Base', 'Context'], label="Select One"),
#         gr.Textbox(label="Question"),
#     ],
#     outputs="text",
#     title="Retrieval Augumented Generation using zephyr-7b-beta"
# )

# pred.launch(share=True)