In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', '')
os.environ['LANGCHAIN_API_KEY'] = os.getenv('LANGCHAIN_API_KEY', '')
os.environ['LANGCHAIN_TRACING_V2'] = "true"
os.environ['LANGCHAIN_PROJECT'] = os.getenv('LANGCHAIN_PROJECT', 'default')

In [11]:
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader("https://en.wikipedia.org/wiki/Large_language_model")
data = loader.load()

Load -> Docs -> Divide our text into chunks -> text -> vectors -> vector Embedding -> Vector Store DB

In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(data)

In [14]:
len(docs)

130

In [15]:
docs1 = docs[:5]

In [16]:
docs1

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/Large_language_model', 'title': 'Large language model - Wikipedia', 'language': 'en'}, page_content='Large language model - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload fileSpecial pages\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\nDonate Create account Log in\n\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\

In [17]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

In [18]:
from langchain_community.vectorstores import FAISS
vectorstoredb = FAISS.from_documents(docs1, embeddings)

In [19]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4.1-nano")
print("LLM initialized with model:", llm.model_name)

LLM initialized with model: gpt-4.1-nano


In [20]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

prompt=ChatPromptTemplate.from_template(
    """
You are a helpful assistant. Answer the question based on the provided context.
<context>
{context}
</context>
    """
)

document_chain = create_stuff_documents_chain(llm, prompt)
document_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\nYou are a helpful assistant. Answer the question based on the provided context.\n<context>\n{context}\n</context>\n    '), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x0000012265F33340>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x0000012265F33250>, root_client=<openai.OpenAI object at 0x0000012265F33850>, root_async_client=<openai.AsyncOpenAI object at 0x0000012265F303A0>, model_name='gpt-4.1-nano', model_kwargs={}, openai_api_key=SecretStr('**********'))
| StrOutputParser

In [None]:
from langchain_core.documents import Document

document_chain.invoke(
    {

    }
)

In [21]:
vectorstoredb

<langchain_community.vectorstores.faiss.FAISS at 0x1221a039360>

In [22]:
retriever=vectorstoredb.as_retriever()
from langchain.chains import create_retrieval_chain
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [23]:
retrieval_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001221A039360>, search_kwargs={}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\nYou are a helpful assistant. Answer the question based on the provided context.\n<context>\n{context}\n</context>\n    '), additional_kwargs={})])


In [25]:
response = retrieval_chain.invoke({"input": "What is a large language model?"})

In [26]:
response["answer"]

'Based on the provided Wikipedia content, a large language model (LLM) is a type of artificial intelligence model designed to understand and generate human language. The article covers various aspects of LLMs, including their development history, dataset preprocessing steps such as tokenization and dataset cleaning, training architectures like reinforcement learning from human feedback, and considerations regarding their wider impact, including ethical and environmental concerns. The structure also addresses evaluation methods, properties, interpretation, and additional features like multimodality and reasoning abilities.'