In [1]:
# pip install langchain_community

In [2]:
# pip install langchain-huggingface

In [1]:
#importing the main libraries for setting up code to interact with LLM
import torch
import transformers
from transformers import AutoTokenizer
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain import PromptTemplate, LLMChain
from langchain_huggingface import HuggingFacePipeline

In [2]:
import os
os.environ["HF_TOKEN"] = ''

In [3]:
# to load model in quantized weights to save GPU memory
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=getattr(torch, "float16")
)

In [4]:
# loading the model
model_name = 'microsoft/phi-2'
device = 'cuda:0'

model_config = transformers.AutoConfig.from_pretrained(
    model_name,
    trust_remote_code=True,
    max_new_tokens=1024
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config, 
    device_map=device,
)
    
tokenizer = AutoTokenizer.from_pretrained(model_name)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# defining the pipeline
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    max_length=4096,
    truncation=True,
    temperature=0.5,
    do_sample=True,
    top_p=0.95,
    repetition_penalty=1.5,
    device_map="auto",
)



### Test HF pipeline

In [42]:
%%time
question = "Which are the top 5 companies in world with their revenue in table format?"
print(pipeline(question))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Which are the top 5 companies in world with their revenue in table format?\n## INPUT\nCompany | Revenue (in billions) \t\nApple Inc.| 274,515 \t\t  \nAmazon Web Services LLC | 143 billion   \nMicrosoft Corporation | 144 million     \nAlphabet Inc., Google's parent company - $182 Billion    \nFacebook, Meta Platforms Ltd.-$70-72Billion      \t\t\t                                         \u200b\n\n"}]
CPU times: total: 5.2 s
Wall time: 6.09 s


In [9]:
# # Make sure the model path is correct for your system!
# llm = LlamaCpp(
#     model_path=r"C:\Users\ritap\.cache\lm-studio\models\lmstudio-community\Meta-Llama-3-8B-Instruct-GGUF\Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
#     n_gpu_layers=n_gpu_layers, n_batch=n_batch,
#     n_ctx = 3000,
#     temperature=0.0,
#     max_tokens=2000,
#     top_p=1,
#     callback_manager=callback_manager,
#     verbose=True, # Verbose is required to pass to the callback manager
# )

### langchain LLM instance from HF pipeline 

In [6]:
llm = HuggingFacePipeline(pipeline = pipeline)

### Test the LLM

In [12]:
%%time
#Question for LLM
question = "Which are the top 5 companies in world with their revenue in table format?"

#providing the results
print("<====================================== Outcome from model =======================================>")
print(llm.invoke(question))



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Which are the top 5 companies in world with their revenue in table format?
The five largest listed corporations by market capitalization as of June 30, 2018 were: Apple Inc. (Apple), Alphabet LLC/Google parent company Google GOOG -0.16%, Facebook FB 0.00%, Amazon AMZN +1%.40%), and Microsoft MSFT 1).
CPU times: total: 4.42 s
Wall time: 5.13 s


In [13]:
# # Callbacks support token-wise streaming
# callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# n_gpu_layers = 1 # Change this value based on your model and your GPU VRAM pool.
# n_batch = 4 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

### Making of prompt with Langchain template

In [38]:
template = """Question: {question}\
Answer: Let’s work this out in a step by step way to be sure we have the right answer"""

prompt = PromptTemplate(template=template, input_variables=["question"])

### Making the vector database
#### This will be our additional Knowledge Base

In [7]:
import os
os.environ['USER_AGENT'] = 'myagent'

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader

In [9]:
weblink = "https://www.investopedia.com/biggest-companies-in-the-world-by-market-cap-5212784"
loader = WebBaseLoader(weblink)
data = loader.load()

In [10]:
# data

In [11]:
#split the data into small chunks 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)

In [12]:
# all_splits

In [13]:
#Performing Embedding
from langchain_huggingface import HuggingFaceEmbeddings # other embeddings available 
from langchain_community.vectorstores import Chroma

In [14]:
#storing the data in Vector Store
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}

embedding = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

vectorstore = Chroma.from_documents(documents=all_splits, embedding=embedding)



In [15]:
len(vectorstore)

11

## Performing a similarity searh on the vector database
### This is O(n) operation, so might not be feasable if the Database contain a lot of items.

In [16]:
question = "Which are the top 5 companies in world with their revenue in table format?"
docs = vectorstore.similarity_search(question)

In [17]:
print(f"This returns {len(docs)} items from the database with highest similarities")

This returns 4 items from the database with highest similarities


## Retrival Augmented Generation

In [18]:
from langchain.chains import RetrievalQA

In [19]:
retriever = vectorstore.as_retriever()

In [20]:
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True
)

In [21]:
%%time
response = qa.run(question)
# full_responnse = f"{question}\n{response}"
# print(full_response)

  warn_deprecated(
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(




[1m> Entering new RetrievalQA chain...[0m


OutOfMemoryError: CUDA out of memory. Tried to allocate 42.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 2.71 GiB is allocated by PyTorch, and 316.85 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)