In [1]:
import json

from langchain_core.documents import Document

def f1(fname):
    """Load a JSON file of records into a list of LangChain ``Document`` objects.

    The file is read as a JSON array of objects. For each object the value
    under ``"content"`` becomes ``page_content`` and every other key/value
    pair is carried over unchanged as ``metadata``.

    Args:
        fname: Path of the JSON file to read.

    Returns:
        A list with one ``Document`` per record, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly: without an encoding, open() falls back to the
    # platform locale and can mis-decode (or reject) non-ASCII text.
    with open(fname, encoding="utf-8") as fobj:
        data = json.load(fobj)

    docs = []
    for record in data:
        # Records without a "content" key fall back to a single space.
        content = record.get("content", " ")
        metadata_result = {k: v for k, v in record.items() if k != "content"}
        docs.append(Document(page_content=content, metadata=metadata_result))
    return docs
        

In [2]:
# Load the records from data.json into Document objects and echo the list
data = f1('data.json')
data

[Document(metadata={'title': 'LangChain Overview'}, page_content='LangChain is a framework for developing applications powered by language models.'),
 Document(metadata={'title': 'What is Ollama?'}, page_content='Ollama allows running large language models locally like LLaMA and Mistral.'),
 Document(metadata={'title': 'Embeddings in NLP'}, page_content='Embeddings are vector representations of text used for similarity and retrieval.')]

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

In [4]:
# Break the loaded documents into chunks of up to 300 characters, with a
# 50-character overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
)
splitted_docs = splitter.split_documents(documents=data)

# Embedding model obtained through Ollama
embedding = OllamaEmbeddings(model="gemma:2b")

# Index the chunks in a Chroma vector store and expose it as a retriever
vectordb = Chroma.from_documents(documents=splitted_docs, embedding=embedding)
retriver = vectordb.as_retriever()

  embedding = OllamaEmbeddings(model="gemma:2b")


In [5]:
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain 
from langchain.chains import create_retrieval_chain
from langchain_community.llms import Ollama

In [6]:
# Prompt template: {context} receives the retrieved documents, {input} the question
prompt_obj = PromptTemplate.from_template(
    "You are an expert assistant {context} Question:{input} Answer:"
)

# LLM obtained through Ollama
llm_obj = Ollama(model="gemma:2b")

# Chain that places the retrieved documents into the prompt and calls the LLM
doc_chain = create_stuff_documents_chain(
    llm=llm_obj,
    prompt=prompt_obj,
)

# Retrieval chain: fetch documents with the retriever, then run doc_chain on them
rag_chain = create_retrieval_chain(
    retriever=retriver,
    combine_docs_chain=doc_chain,
)

# Ask a question and show the full response
response = rag_chain.invoke({"input": "what is langchain?"})
print(response)

  llm_obj = Ollama(model='gemma:2b')


{'input': 'what is langchain?', 'context': [Document(metadata={'title': 'LangChain Overview'}, page_content='LangChain is a framework for developing applications powered by language models.'), Document(metadata={'title': 'What is Ollama?'}, page_content='Ollama allows running large language models locally like LLaMA and Mistral.'), Document(metadata={'title': 'Embeddings in NLP'}, page_content='Embeddings are vector representations of text used for similarity and retrieval.')], 'answer': "Sure, here's the answer to your question:\n\n**LangChain** is a framework for developing applications powered by large language models (LLM's). It offers a robust platform for building, training, and deploying robust language-based solutions.\n\nLangChain provides various tools and features that facilitate the development of diverse applications. These include:\n\n* **Ollama:** An API-based library for running LLaMA and Mistral models locally.\n* **Embeddings:** Vector representations of text for simi

import pprint
# Pretty-print the whole response dict returned by the retrieval chain
pprint.pprint(response)

In [8]:
# Pretty-print only the generated answer text
pprint.pprint(response['answer'])

("Sure, here's the answer to your question:\n"
 '\n'
 '**LangChain** is a framework for developing applications powered by large '
 "language models (LLM's). It offers a robust platform for building, training, "
 'and deploying robust language-based solutions.\n'
 '\n'
 'LangChain provides various tools and features that facilitate the '
 'development of diverse applications. These include:\n'
 '\n'
 '* **Ollama:** An API-based library for running LLaMA and Mistral models '
 'locally.\n'
 '* **Embeddings:** Vector representations of text for similarity and '
 'retrieval.\n'
 '* **Data Management:** A comprehensive data pipeline for loading, cleaning, '
 'and transforming data.\n'
 '* **Application Programming Interface (API):** A well-defined API that '
 'allows developers to integrate LLM solutions seamlessly into existing '
 'workflows.\n'
 '* **Community Resources:** A vibrant community of developers and users, '
 'providing support, resources, and collaboration opportunities.\n'
 '

In [9]:
from langchain_core.prompts import ChatPromptTemplate

# Chat-style prompt. The system message tells the model to rely on the
# context; the human message supplies the retrieved documents ({context})
# followed by the user's question ({input}).
# Fixes: "exper" -> "expert", and the retrieved text is now labelled
# "Context:" so the label matches what the system message refers to.
chat_prompt = ChatPromptTemplate.from_messages([
    ('system', 'You are an expert assistant. Use the context below to answer the question.'),
    ('human', 'Context:\n{context}\n\nQuestion: {input}'),
])

llmobj = Ollama(model="gemma:2b")

# Chain that places the retrieved documents into the chat prompt and calls the LLM
doc_chain = create_stuff_documents_chain(llm=llmobj, prompt=chat_prompt)

# Retrieval chain: fetch documents with the retriever, then run doc_chain on them
rag_chain = create_retrieval_chain(retriever=retriver, combine_docs_chain=doc_chain)

# invoke query
response = rag_chain.invoke({'input': 'what is langchain?'})
# display response
print(response)


{'input': 'what is langchain?', 'context': [Document(metadata={'title': 'LangChain Overview'}, page_content='LangChain is a framework for developing applications powered by language models.'), Document(metadata={'title': 'What is Ollama?'}, page_content='Ollama allows running large language models locally like LLaMA and Mistral.'), Document(metadata={'title': 'Embeddings in NLP'}, page_content='Embeddings are vector representations of text used for similarity and retrieval.')], 'answer': 'The context does not provide any information about what langchain is, so I cannot answer this question from the provided context.'}


In [10]:
# Load a blog post with WebBaseLoader
# Source: https://lilianweng.github.io/posts/2023-06-23-agent/
# -----------------------------------------------------
# Only elements whose CSS class is post-title, post-content or post-header
# are parsed (via bs4.SoupStrainer).
import os

# langchain_community warns "USER_AGENT environment variable not set" when this
# variable is missing, so provide a default before the loader is imported.
# setdefault leaves any value the user has already configured untouched.
os.environ.setdefault("USER_AGENT", "langchain-rag-notebook")

from langchain_community.document_loaders import WebBaseLoader
import bs4

url = 'https://lilianweng.github.io/posts/2023-06-23-agent/'
loader = WebBaseLoader(
    web_path=url,
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-title", "post-content", "post-header")
        ),
    },
)
docs = loader.load()
print(type(docs), len(docs))

USER_AGENT environment variable not set, consider setting it to identify your requests.


<class 'list'> 1


In [11]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [12]:
import bs4

In [13]:
# Load the same post again, this time parsing only elements with the
# post-content class.
url = 'https://lilianweng.github.io/posts/2023-06-23-agent/'
loader = WebBaseLoader(
    web_path=url,
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="post-content")},
)
docs = loader.load()
print(type(docs), len(docs))

<class 'list'> 1


In [14]:
# Split the loaded page into chunks of up to 1000 characters, with a
# 200-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
splits = text_splitter.split_documents(documents=docs)

In [15]:
#from langchain.embeddings import HuggingFaceEmbeddings
#embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
#InvalidArgumentError: Collection expecting embedding with dimension of 2048, got 384

# embeddings = OllamaEmbeddings(model="gemma:2b")

In [16]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [17]:
# Re-export the Hugging Face token only when one is actually configured:
# os.getenv returns None for a missing variable, and assigning None into
# os.environ raises TypeError.
_hf_token = os.getenv('HF_TOKEN')
if _hf_token is not None:
    os.environ['HF_TOKEN'] = _hf_token

from langchain.embeddings import HuggingFaceEmbeddings

# Sentence-transformers embedding model, loaded through LangChain
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
print(embeddings)

  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
) model_name='all-MiniLM-L6-v2' cache_folder=None model_kwargs={} encode_kwargs={} multi_process=False show_progress=False


In [21]:
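# vectorstore = Chroma.from_documents(documents=splits,embedding=embeddings)
# The call above failed with:
# InvalidArgumentError: Collection expecting embedding with dimension of 2048, got 384
# Likely cause (verify): no collection_name is passed, so this call targets the same
# default Chroma collection that was filled earlier with OllamaEmbeddings(model="gemma:2b")
# vectors, while all-MiniLM-L6-v2 produces 384-dimensional vectors.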

In [None]:
# Build the vector store for the blog-post chunks in this cell so that
# `vectorstore` exists on a fresh kernel. A dedicated collection_name keeps
# these MiniLM vectors separate from any collection created earlier with a
# different embedding model (mixing the two is the likely cause of the
# "expecting embedding with dimension of 2048, got 384" error -- verify).
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    collection_name="agent_post_minilm",
)
retriver = vectorstore.as_retriever()

# The system message carries the retrieved documents ({context}); the human
# message carries the user's question ({input}). The system string is kept on
# one line because a single-quoted literal cannot span lines.
prompt = ChatPromptTemplate.from_messages([
    ('system', 'you are AI assist\n{context} Use the context to answer the question'),
    ('human', '{input}'),
])

# connect llm
llm_obj = Ollama(model='gemma:2b')

# Chain that places the retrieved documents into the prompt and calls the LLM
qa_chain = create_stuff_documents_chain(llm_obj, prompt)
# Retrieval chain: fetch documents with the retriever, then run qa_chain on them
ret_chain = create_retrieval_chain(retriver, qa_chain)

response = ret_chain.invoke({'input': 'what is Self-Reflection'})
print(response['answer'])

## Refer to the OU-GEN-LLM-4-Activity-WebbaseLoader.ipynb Jupyter file:
## https://github.com/Palanikarthikeyan/OU_GenAI_July_2025-/blob/main/DAY4/OU-GEN-LLM-4-Activity-WebbaseLoader.ipynb
