In [42]:
from dotenv import load_dotenv
load_dotenv()
import os

groq_api_key = os.getenv("GROQ_API_KEY")

## Data Ingestion

In [43]:
from langchain_community.document_loaders import WebBaseLoader

URL = "https://www.bbc.com/news/articles/c5y8nr1dg54o"
webLoader = WebBaseLoader(URL)
docs = webLoader.load()
docs 



In [44]:
docs[0].page_content



### Chunking

In [45]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 50)

In [46]:
splitted_docs = text_splitter.split_documents(docs)

In [47]:
splitted_docs

[Document(metadata={'source': 'https://www.bbc.com/news/articles/c5y8nr1dg54o', 'title': "World's first resort hospital opens in global gambling hub Macau", 'description': 'Macau is trying to diversify its economy away from gaming to become a major centre for healthcare tourism.', 'language': 'en-GB'}, page_content="World's first resort hospital opens in global gambling hub MacauSkip to contentBritish Broadcasting CorporationHomeNewsSportBusinessInnovationCultureArtsTravelEarthAudioVideoLiveHomeNewsIsrael-Gaza WarWar in UkraineUS & CanadaUKUK PoliticsEnglandN. IrelandN. Ireland PoliticsScotlandScotland PoliticsWalesWales PoliticsAfricaAsiaChinaIndiaAustraliaEuropeLatin AmericaMiddle EastIn PicturesBBC InDepthBBC VerifySportBusinessExecutive LoungeTechnology of BusinessFuture of BusinessInnovationTechnologyScience & HealthArtificial IntelligenceAI v the MindCultureFilm & TVMusicArt & DesignStyleBooksEntertainment NewsArtsArts in MotionTravelDestinationsAfricaAntarcticaAsiaAustralia and 

In [48]:
for docs in splitted_docs:
    print(docs.page_content + "\n\n") 

World's first resort hospital opens in global gambling hub MacauSkip to contentBritish Broadcasting CorporationHomeNewsSportBusinessInnovationCultureArtsTravelEarthAudioVideoLiveHomeNewsIsrael-Gaza WarWar in UkraineUS & CanadaUKUK PoliticsEnglandN. IrelandN. Ireland PoliticsScotlandScotland PoliticsWalesWales PoliticsAfricaAsiaChinaIndiaAustraliaEuropeLatin AmericaMiddle EastIn PicturesBBC InDepthBBC VerifySportBusinessExecutive LoungeTechnology of BusinessFuture of BusinessInnovationTechnologyScience & HealthArtificial IntelligenceAI v the MindCultureFilm & TVMusicArt & DesignStyleBooksEntertainment NewsArtsArts in MotionTravelDestinationsAfricaAntarcticaAsiaAustralia and PacificCaribbean & BermudaCentral AmericaEuropeMiddle EastNorth AmericaSouth AmericaWorld’s TableCulture & ExperiencesAdventuresThe SpeciaListTo the Ends of The Earth EarthNatural WondersWeather & ScienceClimate SolutionsSustainable BusinessGreen LivingAudioPodcast CategoriesRadioAudio FAQsVideoBBC MaestroLiveLive




### Embed the content

In [49]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

google_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/gemini-embedding-001", 
    google_api_key=os.getenv("GEMINI_API_KEY"))

### Configure vector store DB and similarity search

In [50]:
from langchain_community.vectorstores import FAISS
vectorstoreDB = FAISS.from_documents(splitted_docs, google_embeddings)

In [51]:
similarity_search_results = vectorstoreDB.similarity_search("Where is Macau?", k = 2)

In [60]:
similarity_search_results

[Document(id='d1e36c8b-2d85-4342-98f8-35e207ac9f09', metadata={'source': 'https://www.bbc.com/news/articles/c5y8nr1dg54o', 'title': "World's first resort hospital opens in global gambling hub Macau", 'description': 'Macau is trying to diversify its economy away from gaming to become a major centre for healthcare tourism.', 'language': 'en-GB'}, page_content="World's first resort hospital opens in global gambling hub MacauSkip to contentBritish Broadcasting CorporationHomeNewsSportBusinessInnovationCultureArtsTravelEarthAudioVideoLiveHomeNewsIsrael-Gaza WarWar in UkraineUS & CanadaUKUK PoliticsEnglandN. IrelandN. Ireland PoliticsScotlandScotland PoliticsWalesWales PoliticsAfricaAsiaChinaIndiaAustraliaEuropeLatin AmericaMiddle EastIn PicturesBBC InDepthBBC VerifySportBusinessExecutive LoungeTechnology of BusinessFuture of BusinessInnovationTechnologyScience & HealthArtificial IntelligenceAI v the MindCultureFilm & TVMusicArt & DesignStyleBooksEntertainment NewsArtsArts in MotionTravelDes

### LLM

In [None]:
from langchain_groq import ChatGroq
llm = ChatGroq(
    model="openai/gpt-oss-120b",
    temperature=0.1,
    max_tokens=1024,
    reasoning_format="parsed"
)

### Retriever and Document Chain

In [54]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate

prompt_template = ChatPromptTemplate.from_template(
    """
    Answer the following questions based on the provided context:
    <context>
    {context}
    </context>
    """
)

document_chain = create_stuff_documents_chain(llm, prompt_template)
document_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\n    Answer the following questions based on the provided context:\n    <context>\n    {context}\n    </context>\n    '), additional_kwargs={})])
| ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x000002E5FB88DE10>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x000002E5FB88E510>, model_name='openai/gpt-oss-120b', temperature=0.1, reasoning_format='parsed', model_kwargs={}, groq_api_key=SecretStr('**********'), max_tokens=1024)
| StrOutputParser(), kwargs={}, config={'run_name': 'stuff_documents_chain'}, config_factories=[])

### create a retriever out of vector store DB 

In [55]:
retriever = vectorstoreDB.as_retriever()

In [56]:
from langchain.chains import create_retrieval_chain
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [57]:
retrieval_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['FAISS', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000002E5FB863FD0>, search_kwargs={}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\n    Answer the following questions based on the provided context:\n    <context>\n    {context}\n    </context>\n    '), additional_kwa

In [61]:
response = retrieval_chain.invoke({"input": "Tell me about the resort hospital in Macau."})
response['answer']

'I’m ready to help, but I don’t see any specific questions listed after the prompt. Could you please provide the questions you’d like answered based on the context?'