## IMPORTING LIBRARIES

In [2]:
import os 
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import LlamaCpp
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from huggingface_hub import hf_hub_download
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer
import gradio as gr
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## LOADING THE DOCUMENT

In [3]:
pdf_path = "/Users/sidhaarthmurali/Desktop/Exela-Internship/Task-1-ChatBOT-for-PDFs/BERT_analysis.pdf"
loader = PyPDFLoader(file_path=pdf_path)
document = loader.load_and_split()
document

[Document(page_content='What Does BERT Look At?\nAn Analysis of BERT’s Attention\nKevin Clark†Urvashi Khandelwal†Omer Levy‡Christopher D. Manning†\n†Computer Science Department, Stanford University\n‡Facebook AI Research\n{kevclark,urvashik,manning }@cs.stanford.edu\nomerlevy@fb.com\nAbstract\nLarge pre-trained neural networks such as\nBERT have had great recent success in NLP,\nmotivating a growing body of research investi-\ngating what aspects of language they are able\nto learn from unlabeled data. Most recent anal-\nysis has focused on model outputs (e.g., lan-\nguage model surprisal) or internal vector rep-\nresentations (e.g., probing classiﬁers). Com-\nplementary to these works, we propose meth-\nods for analyzing the attention mechanisms of\npre-trained models and apply them to BERT.\nBERT’s attention heads exhibit patterns such\nas attending to delimiter tokens, speciﬁc po-\nsitional offsets, or broadly attending over the\nwhole sentence, with heads in the same layer\noften ex

In [4]:
len(document)

17

## SPLIT DOCUMENT INTO CHUNKS FOR LESSER API CALLS

In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 600, chunk_overlap = 30)
doc = text_splitter.split_documents(documents = document)

In [6]:
len(doc)

75

In [7]:
doc

[Document(page_content='What Does BERT Look At?\nAn Analysis of BERT’s Attention\nKevin Clark†Urvashi Khandelwal†Omer Levy‡Christopher D. Manning†\n†Computer Science Department, Stanford University\n‡Facebook AI Research\n{kevclark,urvashik,manning }@cs.stanford.edu\nomerlevy@fb.com\nAbstract\nLarge pre-trained neural networks such as\nBERT have had great recent success in NLP,\nmotivating a growing body of research investi-\ngating what aspects of language they are able\nto learn from unlabeled data. Most recent anal-\nysis has focused on model outputs (e.g., lan-\nguage model surprisal) or internal vector rep-', metadata={'source': '/Users/sidhaarthmurali/Desktop/Exela-Internship/Task-1-ChatBOT-for-PDFs/BERT_analysis.pdf', 'page': 0}),
 Document(page_content='resentations (e.g., probing classiﬁers). Com-\nplementary to these works, we propose meth-\nods for analyzing the attention mechanisms of\npre-trained models and apply them to BERT.\nBERT’s attention heads exhibit patterns such\

## HUGGINGFACE EMBEDDINGS

In [8]:

embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

## USING PINECONE AS A VECTORSTORE

In [9]:
pip install pinecone-client


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [10]:
# import pinecone      

# pinecone.init(      
# 	api_key='ec5f7903-5240-4214-8104-1ab6cbf304fe',      
# 	environment='us-west1-gcp-free'      
# )      
# index = ('llama2pdf')

In [11]:
vectorstore = FAISS.from_documents(documents= doc, embedding= embeddings)
vectorstore.save_local("BERT-Learnings")
new_vectorstore = FAISS.load_local("BERT-Learnings", embeddings)

In [12]:
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML"
model_basename = "llama-2-13b-chat.ggmlv3.q5_1.bin" # the model is in bin format

In [13]:
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

In [14]:
pip install llama-cpp-python

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [15]:
# Loading model,
llm = LlamaCpp(
    model_path=model_path,
    max_tokens=512,
    verbose=False,
)

llama.cpp: loading model from /Users/sidhaarthmurali/.cache/huggingface/hub/models--TheBloke--Llama-2-13B-chat-GGML/snapshots/47d28ef5de4f3de523c421f325a2e4e039035bab/llama-2-13b-chat.ggmlv3.q5_1.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 5120
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 40
llama_model_load_internal: n_head_kv  = 40
llama_model_load_internal: n_layer    = 40
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 13824
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 9 (mostly Q5_1)
llama_model_load_internal: model size = 13B
llama_model_load_internal: ggml ctx size =    0.11 MB
llama_model

In [16]:
# from langchain.chains.question_answering import load_qa_chain
qa = RetrievalQA.from_chain_type(llm = llm, chain_type = "map_reduce", retriever = new_vectorstore.as_retriever())

In [17]:
with gr.Blocks() as app:
    chatbot = gr.Chatbot(label = "BERTchat")
    msg = gr.Textbox(placeholder="Ask me anything about BERT?")
    clear = gr.ClearButton([msg, chatbot])

In [18]:
with gr.Blocks() as app:
    chatbot = gr.Chatbot(label = "BERTchat")
    msg = gr.Textbox(placeholder="Ask me anything about BERT?")
    clear = gr.ClearButton([msg, chatbot])

    def respond(user_query, chat_history):
        bot_message = qa.run(user_query)
        chat_history.append((user_query, bot_message))
        return "", chat_history
        

    msg.submit(respond, [msg, chatbot], [msg, chatbot])

In [19]:
app.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


