In [1]:
import os
import pickle
import langchain
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import torch

In [2]:
# Local LLM initialization using HuggingFacePipeline.
#
# Configuration is gathered first so that each tunable is defined exactly once.
# DEVICE is shared by the generation pipeline in this cell and by the embedding
# model built in a later cell.
model_id = 'google/flan-t5-base'
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
DEVICE = "cpu"
SEED = 42

# The generation settings below enable sampling (do_sample=True), so seed
# torch's global RNG to start every fresh kernel run from the same RNG state.
torch.manual_seed(SEED)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Wrap the model in a pipeline
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=DEVICE,
)

# Pass valid generation arguments to the Langchain wrapper
llm = HuggingFacePipeline(
    pipeline=pipe,
    pipeline_kwargs={"max_new_tokens": 256, "do_sample": True, "temperature": 0.1}
)

Device set to use cpu


In [3]:
# The two Moneycontrol articles that make up the corpus for this notebook.
article_urls = [
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
]

# Build the URL loader, load the pages, and show how many documents came back.
loaders = UnstructuredURLLoader(urls=article_urls)
data = loaders.load()
len(data)

2

In [4]:
# Splitter settings: chunk_size=500 with a chunk_overlap of 100.
splitter_settings = {"chunk_size": 500, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Break the loaded documents into chunks and report how many were produced.
docs = text_splitter.split_documents(data)
len(docs)

194

In [5]:
# Peek at the first chunk to sanity-check what the splitter produced.
docs[0]

Document(metadata={'source': 'https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html'}, page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nTrending Stocks\n\nAdani Power INE814H01029, ADANIPOWER, 533096\n\nTata Motors INE155A01022, TATAMOTORS, 500570\n\nOla Electric INE0LXG01040, OLAELEC, 544225\n\nVodafone Idea INE669E01016, IDEA, 532822\n\nTCS INE467B01029, TCS, 532540\n\n\n\nQuotes\n\nMutual Funds\n\nCommodities\n\nFutures & Options\n\nCurrency\n\nNews\n\nTopic\n\nCryptocurrency\n\nForum\n\nNotices\n\nVideos\n\nGlossary\n\nAll')

In [6]:
# Embedding model: name and device come from the constants defined earlier
# (EMBEDDING_MODEL_NAME, DEVICE).
embedding_model_kwargs = {'device': DEVICE}
embeddings = HuggingFaceEmbeddings(
    model_kwargs=embedding_model_kwargs,
    model_name=EMBEDDING_MODEL_NAME,
)

# Build the FAISS vector store from the chunked documents and the embeddings.
vectorindex_hf_local = FAISS.from_documents(docs, embeddings)

In [7]:
# Persist the in-memory vector index to a local pickle file.
file_path = "vector_index_hf_local.pkl"
with open(file_path, mode="wb") as index_file:
    pickle.dump(vectorindex_hf_local, file=index_file)

In [8]:
# Reload the vector index from the pickle file at `file_path`.
#
# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only
# load an index file that this notebook wrote itself, never one obtained from
# an untrusted source.
if not os.path.exists(file_path):
    # Fail loudly here: silently skipping the load would leave `vectorIndex`
    # undefined (or stale from an earlier kernel session), and the failure
    # would only surface later when the chain is built.
    raise FileNotFoundError(f"Vector index file not found: {file_path}")

with open(file_path, "rb") as f:
    vectorIndex = pickle.load(f)

In [9]:
# Expose the reloaded index as a retriever, then combine it with the local LLM
# into a RetrievalQAWithSourcesChain. The bare `chain` displays the result.
retriever = vectorIndex.as_retriever()
chain = RetrievalQAWithSourcesChain.from_llm(retriever=retriever, llm=llm)
chain



In [10]:
# Enable LangChain's debug tracing so the intermediate chain steps are printed.
langchain.debug = True

# Ask a question about the loaded articles and keep only the chain's outputs.
query = "what is the price of Tiago iCNG?"
chain_inputs = {"question": query}
result = chain.invoke(chain_inputs, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what is the price of Tiago iCNG?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "The Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh, while the Tigor iCNG comes at a price range of Rs 7.8 lakh to Rs 8.95 lakh.\n\nTata Motors Passenger Vehicles Ltd Head-Marketing, Vinay Pant said these introductions put together will make the company's CNG line up \"appealing, holistic, and stronger than ever\".\n\nPTI\n\nfirst published: Aug 4, 2023 02:17 pm\n\nBusiness News,\n\nSensex, and\n\nNifty updates. Obtain\n\nPersonal Finance insights, tax queries, and expert opinions

Token indices sequence length is longer than the specified maximum sequence length for this model (1980 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1993 > 512). Running this sequence through the model will result in indexing errors


[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "what is the price of Tiago iCNG?",
  "summaries": "Content: The Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh, while the Tigor iCNG comes at a price range of Rs 7.8 lakh to Rs 8.95 lakh.\nSource: https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html\n\nContent: Rs 7.1 lakh\nSource: https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html\n\nContent: Rs 7.1 lakh\nSource: https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html\n\nContent: Text1: Tata Motors launches Punch iCNG, price starts at Rs 7.1 lakh Text2: Business Markets Stocks Economy Companies Trends IPO Opinion EV Special Home News Business Tata Motors launches Punc