#**Llama 2+ Pinecone + LangChain**

##**Step 1: Install All the Required Pakages**

In [2]:
!pip install langchain
!pip install pypdf
!pip install unstructured
!pip install sentence_transformers
!pip install pinecone-client
!pip install llama-cpp-python
!pip install huggingface_hub



#**Step 2: Import All the Required Libraries**

In [3]:
from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer
from langchain.chains.question_answering import load_qa_chain
import pinecone
import os

#**Step 3: Load the Data**

In [4]:
!gdown "https://drive.google.com/uc?id=15hUEJQViQDxu_fnJeO_Og1hGqykCmJut&confirm=t"

# !gdown "https://drive.google.com/file/d/1wh4tEM7BzcFhQQvQAuyr_f67TLbfrCXg/view"

Downloading...
From: https://drive.google.com/uc?id=15hUEJQViQDxu_fnJeO_Og1hGqykCmJut&confirm=t
To: /content/The-Field-Guide-to-Data-Science.pdf
100% 30.3M/30.3M [00:00<00:00, 32.5MB/s]


In [5]:
# loader = OnlinePDFLoader("/content/cs4349.pdf")
# loader = PyPDFLoader("/content/cs4349.pdf")
loader = PyPDFLoader("/content/19908___Introduction to Algorithms.pdf")

In [6]:
data = loader.load()

In [7]:
data

[Document(page_content='Instructor’s Manual\nby ThomasH. Cormen\ntoAccompany\nIntroductiontoAlgorithms\nThird Edition\nby ThomasH. Cormen\nCharles E. Leiserson\nRonald L. Rivest\nClifford Stein\nTheMITPress\nCambridge, Massachusetts London, England', metadata={'source': '/content/19908___Introduction to Algorithms.pdf', 'page': 0}),
 Document(page_content='Instructor’s Manual to Accompany Introduction to Algorithms , Third Edition\nby Thomas H.Cormen, Charles E.Leiserson, Ronald L.Rivest, and Clifford Stein\nPublished by the MIT Press. Copyright c\r2009 by The Massachusetts Institute of Technology. All righ ts\nreserved.\nNopart ofthis publication may bereproduced ordistributed inany formor byany means,orstored in adatabase\nor retrieval system, without the prior written consent of Th e MITPress, including, but not limited to, network or\nother electronic storage or transmission, or broadcast for distance learning.', metadata={'source': '/content/19908___Introduction to Algorithms.pdf'

In [8]:
loader2 = PyPDFLoader("/content/Introduction.to.Algorithms.4th.Edition.pdf")

In [9]:
data = loader2.load()

In [10]:
data

[Document(page_content='', metadata={'source': '/content/Introduction.to.Algorithms.4th.Edition.pdf', 'page': 0}),
 Document(page_content='Introduction to Algorithms\nFourth Edition', metadata={'source': '/content/Introduction.to.Algorithms.4th.Edition.pdf', 'page': 1}),
 Document(page_content='Thomas H. Cormen\nCharles E. Leiserson\nRonald L. Rivest\nClifford Stein\nIntroduction to Algorithms\nFourth Edition\nThe MIT Press\nCambridge, Massachusetts London, England', metadata={'source': '/content/Introduction.to.Algorithms.4th.Edition.pdf', 'page': 2}),
 Document(page_content='© 2022 Massachusetts Institute of Technology\nAll rights reserved. No part of this book may be reproduced in any form or by any electronic or\nmechanical means (including photocopying, recording, or information storage and retrieval)\nwithout permission in writing from the publisher.\nThe MIT Press would like to thank the anonymous peer reviewers who provided comments on\ndrafts of this book. The generous work of

#**Step 4: Split the Text into Chunks**

In [11]:
text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

In [12]:
docs=text_splitter.split_documents(data)

In [13]:
len(docs)

6112

In [14]:
docs[0]

Document(page_content='Introduction to Algorithms\nFourth Edition', metadata={'source': '/content/Introduction.to.Algorithms.4th.Edition.pdf', 'page': 1})

#**Step 5: Setup the Environment**

In [15]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '657b2891-44ea-4b25-a99c-dd9d86f9c068')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')

#**Step 6: Downlaod the Embeddings**

In [16]:
embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

#**Step 7: Initializing the Pinecone**

In [17]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
index_name = "langchainpinecone" # put in the name of your pinecone index here

#**Step 8: Create Embeddings for Each of the Text Chunk**

In [18]:
docsearch=Pinecone.from_texts([t.page_content for t in docs], embeddings, index_name=index_name)

# If you already have an index, you can load it like this


In [19]:
#docsearch = Pinecone.from_existing_index(index_name, embeddings)


#**Step 9: Similarity Search**

In [20]:
#query="What are examples of good data science teams?"
query="who is the author of the introduction to algorithms 4th edition"

In [21]:
docs=docsearch.similarity_search(query)

In [22]:
docs

[Document(page_content='Introduction to Algorithms\nFourth Edition'),
 Document(page_content='Introduction to Algorithms\nFourth Edition'),
 Document(page_content='Thomas H. Cormen\nCharles E. Leiserson\nRonald L. Rivest\nClifford Stein\nIntroduction to Algorithms\nFourth Edition\nThe MIT Press\nCambridge, Massachusetts London, England'),
 Document(page_content='Thomas H. Cormen\nCharles E. Leiserson\nRonald L. Rivest\nClifford Stein\nIntroduction to Algorithms\nFourth Edition\nThe MIT Press\nCambridge, Massachusetts London, England')]

#**Step 9: Query the Docs to get the Answer Back (Llama 2 Model)**

In [23]:
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose

Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.20.tar.gz (8.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Running command pip subprocess to install build dependencies
  Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
  Collecting scikit-build-core[pyproject]>=0.5.1
    Using cached scikit_build_core-0.7.0-py3-none-any.whl (136 kB)
  Collecting exceptiongroup (from scikit-build-core[pyproject]>=0.5.1)
    Using cached exceptiongroup-1.2.0-py3-none-any.whl (16 kB)
  Collecting packaging>=20.9 (from scikit-build-core[pyproject]>=0.5.1)
    Using cached packaging-23.2-py3-none-any.whl (53 kB)
  Collecting tomli>=1.1 (from scikit-build-core[pyproject]>=0.5.1)
    Using cached tomli-2.0.1-py3-none-any.whl (12 kB)
  Collecting pathspec>=0.10.1 (from scikit-build-core[pyproject]>=0.

#Import All the Required Libraries

In [24]:
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from huggingface_hub import hf_hub_download
from langchain.chains.question_answering import load_qa_chain

In [25]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# Verbose is required to pass to the callback manager

#  Quantized Models from the Hugging Face Community

The Hugging Face community provides quantized models, which allow us to efficiently and effectively utilize the model on the T4 GPU. It is important to consult reliable sources before using any model.

There are several variations available, but the ones that interest us are based on the GGLM library.

We can see the different variations that Llama-2-13B-GGML has [here](https://huggingface.co/models?search=llama%202%20ggml).



In this case, we will use the model called [Llama-2-13B-chat-GGML](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML).

 Quantization reduces precision to optimize resource usage.

Quantization is a technique to reduce the computational and memory costs of running inference by representing the weights and activations with low-precision data types like 8-bit integer ( int8 ) instead of the usual 32-bit floating point ( float32 ).

In [38]:
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML"
model_basename = "llama-2-13b-chat.ggmlv3.q5_1.bin" # the model is in bin format

# model_name_or_path = "TheBloke/CodeLlama-13B-Python-GGUF"
# model_basename = "codellama-13b-python.Q5_K_M.gguf"

In [39]:
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

In [28]:
#  pip install llama-cpp-python==0.1.78 and numpy==1.23.4

Collecting llama-cpp-python==0.1.78
  Using cached llama_cpp_python-0.1.78-cp310-cp310-linux_x86_64.whl
Collecting numpy==1.23.4
  Using cached numpy-1.23.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
Installing collected packages: numpy, llama-cpp-python
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.2
    Uninstalling numpy-1.26.2:
      Successfully uninstalled numpy-1.26.2
  Attempting uninstall: llama-cpp-python
    Found existing installation: llama_cpp_python 0.2.20
    Uninstalling llama_cpp_python-0.2.20:
      Successfully uninstalled llama_cpp_python-0.2.20
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requires fastapi, which is not installed.
lida 0.0.10 requires kaleido, which is not installed.
lida 0.0.10 requires python-multipart, which is not installed.
lida 0.0.10 requires u

In [40]:
n_gpu_layers=40  # Change this value based on your model and your GPU VRAM pool.
n_batch=256  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# Loading model,
llm = LlamaCpp(
    model_path=model_path,
    max_tokens=256,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    callback_manager=callback_manager,
    n_ctx=1024,
    verbose=True,
)

# llm = LlamaCpp(
#   model_path=model_path,
#   max_tokens=256,
#   n_gpu_layers=n_gpu_layers,
#   n_batch=n_batch,
#   callback_manager=callback_manager,
#   n_ctx=1024,
#   verbose=True,
# )

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 


In [41]:
chain=load_qa_chain(llm, chain_type="stuff")

In [42]:
query="who is the author of the introduction to algorithms 4th edition"
docs=docsearch.similarity_search(query)

In [43]:
docs

[Document(page_content='Introduction to Algorithms\nFourth Edition'),
 Document(page_content='Introduction to Algorithms\nFourth Edition'),
 Document(page_content='Thomas H. Cormen\nCharles E. Leiserson\nRonald L. Rivest\nClifford Stein\nIntroduction to Algorithms\nFourth Edition\nThe MIT Press\nCambridge, Massachusetts London, England'),
 Document(page_content='Thomas H. Cormen\nCharles E. Leiserson\nRonald L. Rivest\nClifford Stein\nIntroduction to Algorithms\nFourth Edition\nThe MIT Press\nCambridge, Massachusetts London, England')]

In [44]:
chain.run(input_documents=docs, question=query)

 The authors are Thomas H. Cormen, Charles E. Leiserson, Ronald L. Rivest, and Clifford Stein.

' The authors are Thomas H. Cormen, Charles E. Leiserson, Ronald L. Rivest, and Clifford Stein.'

In [45]:
query="who is the author of the introduction to algorithms 4th edition"
docs=docsearch.similarity_search(query)

In [46]:
chain.run(input_documents=docs, question=query)

Llama.generate: prefix-match hit


 Thomas H. Cormen, Charles E. Leiserson, Ronald L. Rivest, and Clifford Stein are the authors of Introduction to Algorithms, 4th Edition.

' Thomas H. Cormen, Charles E. Leiserson, Ronald L. Rivest, and Clifford Stein are the authors of Introduction to Algorithms, 4th Edition.'

#**Step 10: Query the Docs to get the Answer Back (Hugging Face Model)**

In [None]:
from langchain.llms import HuggingFaceHub

In [None]:
llm=HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})

In [None]:
chain=load_qa_chain(llm, chain_type="stuff")

In [None]:
query="What are examples of good data science teams?"
docs=docsearch.similarity_search(query)

In [None]:
chain.run(input_documents=docs, question=query)

'Data Science teams need a broad view of the organization. Leaders must be key advocates who meet'