In [1]:
# pip install langchain_community

In [2]:
# pip install langchain-huggingface

In [1]:
#importing the main libraries for setting up code to interact with LLM
import torch
import transformers
from transformers import AutoTokenizer
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain import PromptTemplate, LLMChain
from langchain_huggingface import HuggingFacePipeline

In [2]:
import os

# Hugging Face access token for hub downloads.
# Do NOT paste a real token into this notebook. Export HF_TOKEN in the shell
# (or the kernel's environment) instead. `setdefault` keeps a token that is
# already present rather than clobbering it with an empty string, and still
# guarantees the variable exists (as '') when nothing was exported.
os.environ.setdefault("HF_TOKEN", '')

In [3]:
# Quantization settings: load the weights in 4-bit NF4 form to save GPU memory,
# without double quantization, and with float16 as the compute dtype.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

In [4]:
# Load the Phi-2 checkpoint (quantized via `bnb_config`) and its tokenizer.
model_name = "microsoft/phi-2"
device = "cuda:0"  # not used in this cell; placement is handled by device_map below

# NOTE(review): `max_new_tokens` is a generation setting; whether AutoConfig
# keeps or ignores it here should be confirmed against the transformers docs.
model_config = transformers.AutoConfig.from_pretrained(
    model_name, trust_remote_code=True, max_new_tokens=256
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    config=model_config,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# defining the pipeline
# Text-generation pipeline around the already-loaded `model` / `tokenizer`.
# Sampling is enabled (temperature 0.4, top_p 0.95) with a repetition penalty.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    # Budget 256 *generated* tokens. The previous `max_length=256` capped
    # prompt + completion together, so a long prompt (e.g. one carrying
    # retrieved context) would leave little or no room for the answer.
    max_new_tokens=256,
    truncation=True,
    temperature=0.4,
    do_sample=True,
    top_p=0.95,
    repetition_penalty=1.5,
    # NOTE(review): the model above was loaded with device_map='auto' already;
    # this argument (and torch_dtype) is presumably redundant here - confirm.
    device_map="auto",
)



### Test the HF pipeline

In [8]:
%%time
# Smoke test: send one question straight through the raw transformers pipeline
# (before it is wrapped for LangChain) and print the returned result.
question = "Which are the top 5 companies in world with their revenue in table format?"
print(pipeline(question))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


[{'generated_text': 'Which are the top 5 companies in world with their revenue in table format?\n## INPUT\nCompany  | Revenue (in billions)\nApple Inc.| 274,515  \t   Google LLC     | 182,163    \t\t\tFacebook Holding Company      | 68,932           Microsoft Corporation       | 143 billion            Amazon.com INC         | 386               Other Companies: $1 trillion or more        $2 Trillion+             -                 -$3 Billion                0%-10%              11%-20%, 21%-30%.                                         31%-40%: 41%;41 to 50 % 51 - 60 percent 61 – 70 Percent 71 + Percentage of Top 100 Global Corporations by Total Revenues 2018 Source : Statista; Fortune ; Bloomberg Businessweek'}]
CPU times: total: 11.6 s
Wall time: 12.3 s


In [4]:
# # Make sure the model path is correct for your system!
# llm = LlamaCpp(
#     model_path=r"C:\Users\ritap\.cache\lm-studio\models\lmstudio-community\Meta-Llama-3-8B-Instruct-GGUF\Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
#     n_gpu_layers=n_gpu_layers, n_batch=n_batch,
#     n_ctx = 3000,
#     temperature=0.0,
#     max_tokens=2000,
#     top_p=1,
#     callback_manager=callback_manager,
#     verbose=True, # Verbose is required to pass to the callback manager
# )

### Create a LangChain LLM instance from the HF pipeline

In [9]:
# Wrap the transformers pipeline so it can be called through LangChain as `llm`.
llm = HuggingFacePipeline(pipeline=pipeline)

### Test the LLM

In [15]:
%%time
# Smoke test for the LangChain wrapper: same question as the raw pipeline test.
question = "Which are the top 5 companies in world with their revenue in table format?"

# Print a banner first, then whatever text `llm.invoke` returns for the question.
print("<====================================== Outcome from model =======================================>")
print(llm.invoke(question))



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Which are the top 5 companies in world with their revenue in table format?
## INPUT
Company  Revenue (in billions) Apple $274.52 Microsoft Azure ($43 billion), Google Cloud Platform, Amazon Web Services and Alibaba Group Holding Limited have been named as a group of five that will be at forefront to deliver on digital transformation for enterprises this year. These were announced by Gartner Inc., which said it expects these firms […] The post Top 10 Companies In World With Their Revenues appeared first... Read More »
CPU times: total: 7 s
Wall time: 8.28 s


In [16]:
# # Callbacks support token-wise streaming
# callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# n_gpu_layers = 1 # Change this value based on your model and your GPU VRAM pool.
# n_batch = 4 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

### Building the prompt with a LangChain template

In [17]:
# Prompt template with a single `{question}` placeholder.
# The two lines are joined with an explicit "\n". The previous version ended
# the first line with a bare backslash inside the string literal, which is a
# line continuation: it produced "...{question}Answer: ..." with no separator
# between the user's question and the answer cue.
template = (
    "Question: {question}\n"
    "Answer: Let’s work this out in a step by step way to be sure we have the right answer"
)

prompt = PromptTemplate(template=template, input_variables=["question"])

### Building the vector database
#### This will serve as our additional knowledge base

In [18]:
import os

# Set the USER_AGENT environment variable for this process.
# NOTE(review): presumably read by the web loader imported in the next cell,
# so this should run before that import - confirm.
os.environ.update({"USER_AGENT": "myagent"})

In [19]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader

In [20]:
# Download the source article; `data` holds whatever documents the loader returns.
weblink = "https://www.investopedia.com/biggest-companies-in-the-world-by-market-cap-5212784"
loader = WebBaseLoader(web_path=weblink)
data = loader.load()

In [26]:
# data

In [23]:
# Split the loaded documents into smaller chunks (chunk_size=2000, no overlap).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=2000)
all_splits = text_splitter.split_documents(documents=data)

In [27]:
# all_splits

In [15]:
#Performing Embedding
from langchain_huggingface import HuggingFaceEmbeddings # other embeddings available 
from langchain_community.vectorstores import Chroma

In [16]:
# Embed the chunks and store them in a Chroma vector store.
# NOTE(review): `model_name` was bound to the LLM checkpoint in the model-loading
# cell; from here on it refers to the embedding model instead.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device="cuda")

embedding = HuggingFaceEmbeddings(model_kwargs=model_kwargs, model_name=model_name)

vectorstore = Chroma.from_documents(embedding=embedding, documents=all_splits)

