In [15]:
import os
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
import pickle

In [16]:
# Pull the two MS Dhoni reference pages that will serve as the knowledge base.
loader = UnstructuredURLLoader(urls=[
    "https://en.wikipedia.org/wiki/MS_Dhoni",
    "https://www.espncricinfo.com/cricketers/ms-dhoni-28081",
])

# NOTE(review): the name is singular, but the result is handed to
# `split_documents` later, so it is presumably a collection of documents
# (one per URL?) — confirm.
document = loader.load()

In [18]:
# Chunking configuration: chunk_size=1000 with chunk_overlap=200 between
# neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [20]:
# Break the loaded pages into chunks using the splitter configured above.
docs = splitter.split_documents(documents=document)

In [21]:
# Embedding model used to build the FAISS index from the chunks.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(


In [22]:
# Embed every chunk and build the FAISS vector store from the results.
vectors = FAISS.from_documents(documents=docs, embedding=embeddings)

In [23]:
# Persist the vector store to disk so it can be reloaded without re-embedding.
with open("vectors_db.pkl", "wb") as pkl_out:
    pickle.dump(vectors, pkl_out)

In [24]:
# Reload the persisted vector store; `loaded_vectors` is what the retriever is
# built from further down.
# NOTE: unpickling can execute arbitrary code — only load a file that this
# notebook itself wrote, never one from an untrusted source.
with open("vectors_db.pkl", "rb") as pkl_in:
    loaded_vectors = pickle.load(pkl_in)

In [25]:
# Gemini chat model that answers the question from the retrieved chunks.
# No credentials are passed here, so they must already be configured
# elsewhere — presumably via an environment variable; confirm before a
# fresh-kernel run.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.5, max_tokens=100)

In [26]:
# Wrap the reloaded vector store as a retriever. No arguments are passed, so
# the library's default search settings apply.
retriever = loaded_vectors.as_retriever()

In [28]:
# Question-answering chain that combines the retriever with the LLM and
# reports which sources the answer was drawn from.
chain = RetrievalQAWithSourcesChain.from_chain_type(llm=llm, retriever=retriever)

In [30]:
# Ask the chain a question and show the answer together with its sources.
query = "How many centuries has MS Dhoni scored?"

# Calling the chain object directly (`chain(query)`) goes through the
# deprecated `Chain.__call__`; `invoke` is the supported entry point. The
# chain's input key is "question", and the result dict still carries the
# "answer" and "sources" keys read below.
result = chain.invoke({"question": query})

print(result["answer"])
print(result["sources"])

FINAL ANSWER: MS Dhoni has scored 17 international centuries.  He has scored 10 centuries in ODIs and 7 centuries in Test matches.

https://en.wikipedia.org/wiki/MS_Dhoni
